# Install Library

[RDKit](https://github.com/rdkit/rdkit)

[DGL](https://github.com/dmlc/dgl/)

[DGL-LifeSci](https://github.com/awslabs/dgl-lifesci)





In [None]:
%%capture
!pip install rdkit-pypi
!pip install dgllife
!pip install --pre dgl-cu113 dglgo -f https://data.dgl.ai/wheels-test/repo.html

# Import Library

In [None]:
import os

import dgl
import sys
import torch
import random
import cv2
import statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit import DataStructs

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import  History
from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer, AttentiveFPAtomFeaturizer
from sklearn.model_selection import train_test_split

from utils.general import DATASET, get_dataset, separate_active_and_inactive_data, get_embedding_vector_class, count_lablel,data_generator
from utils.gcn_pre_trained import get_sider_model
from utils.special_functions import is_Membership

from model.heterogeneous_siamese_sider import siamese_model_attentiveFp_sider

# All graphs and models that are moved to `device` end up on the CPU.
# NOTE(review): CUDA is never selected here, even when it is available —
# confirm that CPU-only execution is intended.
device = torch.device('cpu')

# Data

In [None]:
# File path intended for the Tox21 DGL graph cache.
cache_path_tox21 = './tox21_dglgraph.bin'

# Load Tox21, keep the molecule identifiers aside, then remove that column
# from the working frame.
df_tox21 = get_dataset("tox21")
ids = df_tox21['mol_id']
df_tox21 = df_tox21.drop('mol_id', axis='columns')

In [None]:
# File path used for the SIDER DGL graph cache, and the SIDER table itself.
# Note that the SIDER frame is bound to the generic name `df`.
cache_path_sider = './sider_dglgraph.bin'
df = get_dataset('sider')

In [None]:
# Tox21 task (label) names. They must be read from the Tox21 frame; `df` holds SIDER.
# Assumes the 12 task columns come first once `mol_id` has been dropped — TODO confirm.
tox21_tasks = df_tox21.columns.values[:12].tolist()

In [None]:
# SMILES columns of both datasets as NumPy arrays.
# The SIDER frame is bound to `df`; no `df_sider` variable is defined in the cells above.
tox21_smiles = np.array(df_tox21['smiles'])
sider_smiles = np.array(df['smiles'])

In [None]:
# SMILES strings that occur in both Tox21 and SIDER, kept in Tox21 order.
# A set gives constant-time membership tests instead of rescanning all of SIDER
# for every Tox21 molecule. A SMILES repeated inside SIDER contributes one entry
# per Tox21 occurrence (not one per matching pair).
sider_smiles_set = set(sider_smiles)
subscriber = [ts for ts in tox21_smiles if ts in sider_smiles_set]

In [None]:
# Display the SMILES strings found in both datasets.
subscriber

['CC(O)(P(=O)(O)O)P(=O)(O)O',
 'C[N+](C)(C)CC(=O)[O-]',
 'C[N+](C)(C)CCO',
 'CC(=O)NO',
 'CC(=O)OCC[N+](C)(C)C',
 'CC(=O)[O-].[Na+]',
 'CCCC(CCC)C(=O)O',
 'Cl[Zn]Cl',
 'CN(CCCl)CCCl',
 'C[N+](C)(C)CCOC(=O)CCC(=O)OCC[N+](C)(C)C',
 'C1N2CN3CN1CN(C2)C3',
 'CCN(CC)C(=S)SSC(=S)N(CC)CC']

# Required functions

In [None]:
def create_dataset_with_gcn_case_study(dataset, class_embed_vector, GCN, tasks):
    """Expand every molecule of `dataset` into one record per label position.

    Each dataset item is unpacked as ``(smiles, graph, labels, mask)``. The graph is
    moved to the module-level `device`, given self-loops, and passed together with
    its ``'h'`` node features through `GCN`; the output is brought to the CPU and
    converted to a NumPy array.

    Args:
        dataset: iterable of ``(smiles, graph, labels, mask)`` items.
        class_embed_vector: indexable by label position; supplies the per-task vector.
        GCN: callable invoked as ``GCN(graph, node_feats)``.
        tasks: sequence of task names, indexed by label position.

    Returns:
        A list of tuples ``(smiles, embedding, one_hot_task,
        class_embed_vector[j], labels[j], tasks[j])``, one per molecule and
        label position ``j``.
    """
    records = []
    # Row k of this matrix is the one-hot code of task k.
    task_onehot = to_categorical(np.arange(len(tasks)))
    for smiles, graph, labels, _mask in dataset:
        graph = dgl.add_self_loop(graph.to(device))
        node_feats = graph.ndata.pop('h')
        embedding = GCN(graph, node_feats).to('cpu').detach().numpy()
        records.extend(
            (smiles, embedding, task_onehot[j], class_embed_vector[j], labels[j], tasks[j])
            for j, _ in enumerate(labels)
        )
    print('Data created!!')
    return records


def create_dataset_with_gcn(dataset, subscriber, class_embed_vector, GCN, tasks, numberTask):
    """Build single-task records for `dataset`, setting aside molecules in `subscriber`.

    Each dataset item is unpacked as ``(smiles, graph, label, mask)``. The graph gets
    self-loops and is passed with its ``'h'`` node features through `GCN`; the output
    is brought to the CPU and converted to a NumPy array.

    Args:
        dataset: iterable of ``(smiles, graph, label, mask)`` items.
        subscriber: container of SMILES strings to set aside.
        class_embed_vector: indexable by task index.
        GCN: callable invoked as ``GCN(graph, node_feats)``.
        tasks: sequence of task names.
        numberTask: index of the task all items of `dataset` belong to.

    Returns:
        ``(created_data, created_subscriber)``. `created_data` holds tuples
        ``(smiles, embedding, one_hot_task, class_embed_vector[numberTask], label,
        tasks[numberTask])`` for molecules not in `subscriber`; `created_subscriber`
        holds the untouched dataset items of the molecules that are.
    """
    task_onehot = to_categorical(np.arange(len(tasks)))
    kept = []
    set_aside = []

    for item in dataset:
        smiles, graph, label, _mask = item
        # The graph is used on the device it already lives on (no `.to(device)` here).
        graph = dgl.add_self_loop(graph)
        node_feats = graph.ndata.pop('h')
        embedding = GCN(graph, node_feats).to('cpu').detach().numpy()
        record = (
            smiles,
            embedding,
            task_onehot[numberTask],
            class_embed_vector[numberTask],
            label,
            tasks[numberTask],
        )
        if smiles not in subscriber:
            kept.append(record)
        else:
            # NOTE(review): the raw dataset item is stored here, not `record`, so the
            # two returned lists hold differently shaped entries — confirm intent.
            set_aside.append(item)
    print('Data created!!')
    return kept, set_aside


# Calculation of embedded vectors for each class

In [None]:
# Split the Tox21 frame into positive and negative subsets, one pair per task.
# The helper is imported as `separate_active_and_inactive_data` (lower-case "s");
# the capitalised spelling is not a name brought in by the import cell.
df_positive, df_negative = separate_active_and_inactive_data(df_tox21, tox21_tasks)

# Report how many positive and negative molecules each task has.
for i, (positives, negatives) in enumerate(zip(df_positive, df_negative)):
    print(f'{tox21_tasks[i]}=> positive: {len(positives)} - negative: {len(negatives)}')

NR-AR=> positive: 309 - negative: 6956
NR-AR-LBD=> positive: 237 - negative: 6521
NR-AhR=> positive: 768 - negative: 5781
NR-Aromatase=> positive: 300 - negative: 5521
NR-ER=> positive: 793 - negative: 5400
NR-ER-LBD=> positive: 350 - negative: 6605
NR-PPAR-gamma=> positive: 186 - negative: 6264
SR-ARE=> positive: 942 - negative: 4890
SR-ATAD5=> positive: 264 - negative: 6808
SR-HSE=> positive: 372 - negative: 6095
SR-MMP=> positive: 918 - negative: 4892
SR-p53=> positive: 423 - negative: 6351


In [None]:
# One graph dataset per entry of `df_positive` / `df_negative`.
# `cache_path` is not defined in the cells above; these frames come from Tox21,
# so the Tox21 cache path is used.
dataset_positive = [
    DATASET(d, smiles_to_bigraph, AttentiveFPAtomFeaturizer(), cache_file_path=cache_path_tox21)
    for d in df_positive
]
dataset_negative = [
    DATASET(d, smiles_to_bigraph, AttentiveFPAtomFeaturizer(), cache_file_path=cache_path_tox21)
    for d in df_negative
]

Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing molecule 1000/6956
Processing molecule 2000/6956
Processing molecule 3000/6956
Processing molecule 4000/6956
Processing molecule 5000/6956
Processing molecule 6000/6956
Processing dgl graphs from scratch...
Processing molecule 1000/6521
Processing molecule 2000/6521
Processing molecule 3000/6521
Processing molecule 4000/6521
Processing molecule 5000/6521
Processing molecule 6000/6521
Processing dgl graphs from scratch...
Processing molecule 1000/5781
Processing molecule 2000/5781
Processing

In [None]:
# Class embedding vectors computed from the positive/negative datasets and the
# shared SMILES list.
embed_class_tox21 = get_embedding_vector_class(
    dataset_positive,
    dataset_negative,
    subscriber,
    radius=2,
    size=512,
)

class vector created!!


# Transfer Learning with BioAct-Het and AttentiveFp GCN

In [None]:
# Fetch the network by name and switch it to evaluation mode.
model_name = "GCN_attentivefp_SIDER"
gcn_model = get_sider_model(model_name)
# The model stays on the device it was loaded on (it is not moved to `device` here).
# Leaving `eval()` as the last expression displays the module structure.
gcn_model.eval()

Downloading GCN_attentivefp_SIDER_pre_trained.pth from https://data.dgl.ai/dgllife/pre_trained/gcn_attentivefp_sider.pth...
Pretrained model loaded


GCNPredictor(
  (gnn): GCN(
    (gnn_layers): ModuleList(
      (0): GCNLayer(
        (graph_conv): GraphConv(in=39, out=256, normalization=none, activation=<function relu at 0x000002844CF3C1F8>)
        (dropout): Dropout(p=0.08333992387843633, inplace=False)
        (bn_layer): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): GCNLayer(
        (graph_conv): GraphConv(in=256, out=256, normalization=none, activation=<function relu at 0x000002844CF3C1F8>)
        (dropout): Dropout(p=0.08333992387843633, inplace=False)
        (bn_layer): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (2): GCNLayer(
        (graph_conv): GraphConv(in=256, out=256, normalization=none, activation=<function relu at 0x000002844CF3C1F8>)
        (dropout): Dropout(p=0.08333992387843633, inplace=False)
        (bn_layer): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
 

In [None]:
# Collect (molecule, task) records over every Tox21 task.
data_ds = []
subscriber_data_ds = []
for i, task in enumerate(tox21_tasks):
    # Keep only the molecules that carry a label for this task.
    task_frame = df_tox21[['smiles', task]].dropna()
    # NOTE(review): Tox21 graphs are cached under `cache_path_sider` here — confirm
    # this is intended rather than `cache_path_tox21`.
    ds = DATASET(task_frame, smiles_to_bigraph, AttentiveFPAtomFeaturizer(), cache_file_path=cache_path_sider)
    data, subscriber_data = create_dataset_with_gcn(ds, subscriber, embed_class_tox21, gcn_model, tox21_tasks, i)
    data_ds.extend(data)
    subscriber_data_ds.extend(subscriber_data)

Processing dgl graphs from scratch...
Processing molecule 1000/7265
Processing molecule 2000/7265
Processing molecule 3000/7265
Processing molecule 4000/7265
Processing molecule 5000/7265
Processing molecule 6000/7265
Processing molecule 7000/7265
Data created!!
Processing dgl graphs from scratch...
Processing molecule 1000/6758
Processing molecule 2000/6758
Processing molecule 3000/6758
Processing molecule 4000/6758
Processing molecule 5000/6758
Processing molecule 6000/6758
Data created!!
Processing dgl graphs from scratch...
Processing molecule 1000/6549
Processing molecule 2000/6549
Processing molecule 3000/6549
Processing molecule 4000/6549
Processing molecule 5000/6549
Processing molecule 6000/6549
Data created!!
Processing dgl graphs from scratch...
Processing molecule 1000/5821
Processing molecule 2000/5821
Processing molecule 3000/5821
Processing molecule 4000/5821
Processing molecule 5000/5821
Data created!!
Processing dgl graphs from scratch...
Processing molecule 1000/6193


In [None]:
from sklearn.model_selection import KFold

# Number of epochs per call to `fit`; also the window length of the plateau check.
Epoch_S = 10


def _records_to_arrays(records):
    """Turn six-field records into the (drug, task, label) arrays fed to the network."""
    drug_side, task_side, labels = [], [], []
    for _smiles, embbed_drug, _onehot_task, embbed_task, lbl, _task_name in records:
        drug_side.append(embbed_drug[0])
        task_side.append(embbed_task)
        labels.append(lbl.tolist())
    # The same fixed layouts are used for training and validation inputs.
    return (
        np.array(drug_side).reshape(-1, 1024, 1),
        np.array(task_side).reshape(-1, 512, 1),
        np.array(labels),
    )


def _accuracy_plateaued(fit_history, n_epochs):
    """Return True when the last `n_epochs` accuracies share one truncated integer percentage.

    Walks backwards from the most recent epoch and prints the truncated percentage
    of every earlier epoch it visits.
    """
    run_length = 1
    previous = int(fit_history.history['accuracy'][-1] * 100)
    for back in range(2, n_epochs + 1):
        current = int(fit_history.history['accuracy'][-back] * 100)
        run_length = run_length + 1 if current == previous else 1
        previous = current
        print(previous)
    return run_length == n_epochs


def evaluate_model(dataset, subscriber_dataset, k = 10 , shuffle = False):
    """Cross-validate the siamese network on `dataset` with `k` folds.

    For every fold the training part is resampled, a fresh network is fitted in
    blocks of `Epoch_S` epochs until the accuracy plateaus (at most 100 extra
    blocks), and the network is evaluated on the held-out part.

    Args:
        dataset: indexable sequence of records
            ``(smiles, embbed_drug, onehot_task, embbed_task, lbl, task_name)``.
        subscriber_dataset: accepted for interface compatibility; not used.
        k: number of folds passed to `KFold`.
        shuffle: whether `KFold` shuffles before splitting (no fixed seed is set,
            so shuffled folds differ between runs).

    Returns:
        A list with one ``(score[1], score[4])`` tuple per fold, taken from
        ``siamese_net.evaluate``. Downstream cells treat the pair as
        (accuracy, AUC) — NOTE(review): confirm against the model's metric order.
    """
    result = []

    # `k` drives the number of folds instead of a hard-coded 10.
    kf = KFold(n_splits=k, shuffle=shuffle, random_state=None)

    for train_index, test_index in kf.split(dataset):

        train_ds = [dataset[index] for index in train_index]
        valid_ds = [dataset[index] for index in test_index]

        label_pos, label_neg, _, _ = count_lablel(train_ds)
        print(f'train positive label: {label_pos} - train negative label: {label_neg}')

        # NOTE(review): `up_and_down_Samplenig` is not among the names imported in the
        # import cell — confirm where it is defined.
        train_ds = up_and_down_Samplenig(train_ds, scale_downsampling = 0.5)

        label_pos, label_neg, _, _ = count_lablel(train_ds)
        print(f'up and down sampling => train positive label: {label_pos} - train negative label: {label_neg}')

        label_pos, label_neg, _, _ = count_lablel(valid_ds)
        print(f'Test positive label: {label_pos} - Test negative label: {label_neg}')

        l_train, r_train, lbls_train = _records_to_arrays(train_ds)
        l_valid, r_valid, lbls_valid = _records_to_arrays(valid_ds)

        # A new network is created for every fold.
        siamese_net = siamese_model_attentiveFp_sider()

        history = History()
        P = siamese_net.fit([l_train, r_train], lbls_train, epochs = Epoch_S, batch_size = 128, callbacks=[history])

        # Keep fitting until one full block of epochs shows a constant accuracy.
        for j in range(100):
            if _accuracy_plateaued(P, Epoch_S):
                break
            P = siamese_net.fit([l_train, r_train], lbls_train, epochs = Epoch_S, batch_size = 128, callbacks=[history])
        print(j+1)

        score = siamese_net.evaluate([l_valid, r_valid], lbls_valid, verbose=1)
        result.append((score[1], score[4]))

    return result

scores = evaluate_model(data_ds, subscriber_data_ds, 10, True)

train positive label: 5273 - train negative label: 64774
up and down sampling => train positive label: 47457 - train negative label: 64774
Test positive label: 580 - Test negative label: 7203
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
85
85
84
83
82
81
80
78
71
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
89
89
88
88
88
88
87
87
86
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
91
91
91
91
91
91
90
90
90
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
93
92
92
92
92
92
92
92
92
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
93
93
93
93
93
93
93
93
93
5
train positive label: 5266 - train negative label: 64781
up and down sampling => train positive label: 47394 - train

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
94
94
94
93
94
94
94
94
94
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
94
94
94
94
94
94
94
94
94
7
train positive label: 5291 - train negative label: 64756
up and down sampling => train positive label: 47619 - train negative label: 64756
Test positive label: 562 - Test negative label: 7221
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
85
85
84
83
82
81
80
78
71
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
89
89
89
88
88
87
87
87
86
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
91
91
91
91
91
90
90
90
90
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
93
93
93
92
92
92
92
92
92
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


91
91
91
91
90
91
90
90
90
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
92
92
92
92
92
92
91
91
91
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
93
93
93
93
93
92
92
92
92
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
93
93
93
93
93
93
93
93
93
6
train positive label: 5281 - train negative label: 64766
up and down sampling => train positive label: 47529 - train negative label: 64766
Test positive label: 572 - Test negative label: 7211
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
86
85
84
84
82
81
80
78
70
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
89
89
89
89
88
88
88
87
87
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
E

Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
85
85
84
83
82
81
80
78
71
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
89
89
89
88
88
87
87
87
86
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
91
91
90
90
90
90
90
90
89
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
92
92
92
92
92
91
91
91
91
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
93
93
93
93
92
92
92
92
92
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
94
93
93
93
93
93
93
93
93
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
94
94
94
94
94
94
94
94
93
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
E

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
92
92
92
92
91
91
91
91
91
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
93
93
92
92
92
92
92
92
92
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
93
93
93
93
93
93
93
93
93
6
train positive label: 5292 - train negative label: 64755
up and down sampling => train positive label: 47628 - train negative label: 64755
Test positive label: 561 - Test negative label: 7222
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
85
85
84
83
82
81
80
78
70
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
89
89
89
88
88
88
87
87
86
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
91
91
91
91
91
9

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
90
90
90
89
89
88
88
88
87
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
92
92
92
91
91
91
91
91
91
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
93
93
93
93
93
92
92
92
92
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
93
93
93
94
93
93
93
93
93
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
94
94
94
94
94
94
94
94
94
6
train positive label: 5229 - train negative label: 64818
up and down sampling => train positive label: 47061 - train negative label: 64818
Test positive label: 624 - Test negative label: 7159
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
86
86
85
84
83
82
80
78
71
Epoch 1/10
Epoch 2/10
Epoch 3/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
94
94
94
94
94
94
94
94
94
7


#### Dropout = 0.3 and downsampling = 0.5

In [None]:
# Display the per-fold score pairs returned by `evaluate_model`.
scores

[(0.909417986869812, 0.8909998536109924),
 (0.9104458689689636, 0.8691753149032593),
 (0.913144052028656, 0.8680707812309265),
 (0.9166131019592285, 0.8781477212905884),
 (0.920724630355835, 0.861714243888855),
 (0.9145573973655701, 0.8755292892456055),
 (0.9218810200691223, 0.8655411601066589),
 (0.9247077107429504, 0.8602591156959534),
 (0.922137975692749, 0.8681714534759521),
 (0.9125016331672668, 0.873445987701416)]

In [None]:
# Separate the two entries of every per-fold pair, then report the mean of each
# and the standard deviation of the second.
acc = [fold[0] for fold in scores]
auc = [fold[1] for fold in scores]

print(f'accuracy= {np.mean(acc)} AUC= {np.mean(auc)} STD_AUC= {np.std(auc)}')

accuracy= 0.9166131377220154 AUC= 0.8711054921150208 STD_AUC= 0.008536810669444152


# **Case study with BioAct-Het**

In [None]:
# Reload the network, switch it to evaluation mode, and place it on `device`.
model_name = "GCN_attentivefp_SIDER"
gcn_model = get_sider_model(model_name)
gcn_model = gcn_model.eval().to(device)

In [None]:
# SMILES column of the SIDER frame as a NumPy array.
sider_smiles = df['smiles'].to_numpy()

In [None]:
# Directory that holds the case-study CSV. The original local folder stays the
# default; set the BIOACT_HET_DATA_DIR environment variable to point elsewhere
# when running on another machine.
dir_path = os.environ.get('BIOACT_HET_DATA_DIR', 'C:/Users/Ali/Desktop/thesis')

In [None]:
# Read the case-study table from `dir_path`.
df_case_study = pd.read_csv(f'{dir_path}/(sider)case_study.csv')

In [None]:
# Display the case-study table.
df_case_study

Unnamed: 0.1,Unnamed: 0,smiles,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,...,Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications",Drug_Name
0,0,Cl.CN[C@H](CC(C)C)C(=O)N[C@@H]1[C@H](O)C2=CC=C...,,,,,,,,,...,,,,,,,,,,Vancomycin
1,1,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...,,,,,,,,,...,,,,,,,,,,cyclosporine
2,2,Cl.CCCCCCCCC1=CC=C(CCC(N)(CO)CO)C=C1,,,,,,,,,...,,,,,,,,,,fingolimod
3,3,CC(C)CC(C(=NC(CCC(=O)O)C(=NC(CCCCN)C(=NC(CCC(=...,,,,,,,,,...,,,,,,,,,,interferon-beta 1a
4,4,CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](...,,,,,,,,,...,,,,,,,,,,Remdesivir
5,5,CCOC(=O)C1=C[C@@H](OC(CC)CC)[C@H](NC(C)=O)[C@@...,,,,,,,,,...,,,,,,,,,,Oseltamivir
6,6,CC(C)[C@H](NC(=O)N(C)CC1=CSC(=N1)C(C)C)C(=O)N[...,,,,,,,,,...,,,,,,,,,,Ritonavir
7,7,CC(C)C(=O)OC[C@H]1O[C@H]([C@H](O)[C@@H]1O)N1C=...,,,,,,,,,...,,,,,,,,,,Molnupiravir
8,8,CC1(C2C1C(N(C2)C(=O)C(C(C)(C)C)NC(=O)C(F)(F)F)...,,,,,,,,,...,,,,,,,,,,Paxlovid
9,9,CCC(N[C@H]1C2=CN=CC(C3=CC=C(N(C(CC4)=O)C)C4=C3...,,,,,,,,,...,,,,,,,,,,Baxdrostat


In [None]:
# Drug names of the case-study molecules, in row order.
drug_name = df_case_study['Drug_Name'].to_numpy()

In [None]:
# SMILES strings of the case-study molecules, in row order.
candidate_smiles = df_case_study['smiles'].to_numpy()

In [None]:
# Presumably reports whether any candidate SMILES already occurs in SIDER —
# NOTE(review): confirm against `utils.special_functions.is_Membership`.
is_Membership(sider_smiles, candidate_smiles)

False

In [None]:
# Graph dataset for the full SIDER table, expanded into one record per task.
# NOTE(review): `embed_class_sider` and `sider_tasks` are not defined in the cells
# visible above — confirm where they are created.
dataset = DATASET(
    df,
    smiles_to_bigraph,
    AttentiveFPAtomFeaturizer(),
    cache_file_path=cache_path_sider,
)
ds_train = create_dataset_with_gcn_case_study(
    dataset,
    embed_class_sider,
    gcn_model,
    sider_tasks,
)

Processing dgl graphs from scratch...
Processing molecule 1000/1427
Data created!!


In [None]:
# Graph dataset for the case-study molecules.
# NOTE(review): `columns[1:29]` is a positional slice of the CSV — confirm that it
# selects 'smiles' plus the intended label columns.
dataset_study = DATASET(df_case_study[df_case_study.columns[1:29]],smiles_to_bigraph, 
                        AttentiveFPAtomFeaturizer(), cache_file_path = cache_path_sider)

# These four arguments match `create_dataset_with_gcn_case_study`, whose six-field
# records are what the evaluation cells unpack. `create_dataset_with_gcn` requires
# six arguments and returns a pair of lists, so calling it here fails.
ds_study = create_dataset_with_gcn_case_study(dataset_study, embed_class_sider, gcn_model, sider_tasks)

Processing dgl graphs from scratch...
Data created!!


In [None]:
# Number of case-study records.
len(ds_study)

351

### Training algorithm

In [None]:
# Number of epochs per call to `fit`; also the window length of the plateau check.
Epoch_S = 15

# Network inputs and labels derived from the SIDER training records.
l, r , lbls = data_generator(ds_train)

l = np.array(l).reshape(-1,1024,1)
r = np.array(r).reshape(-1,512,1)
lbls=np.array(lbls)

history = History()

siamese_net = siamese_model_attentiveFp_sider()


s = siamese_net.fit([l, r], lbls, epochs = Epoch_S, shuffle=True, batch_size=128, callbacks=[history])

# Keep training in blocks of Epoch_S epochs (at most 1000 extra blocks) until every
# epoch of the latest block shows the same truncated integer accuracy percentage.
for j in range(1000):
    C = 1
    Before = int(s.history['accuracy'][-1]*100)
    # Walk backwards through the block, counting consecutive equal percentages.
    for i in range(2, Epoch_S+1):
        current = int(s.history['accuracy'][-i]*100)
        C = C + 1 if current == Before else 1
        Before = current
        print(Before)
    if C == Epoch_S:
        break
    # `callbacks` is passed as a list, exactly like the first fit call above.
    s = siamese_net.fit([l, r], lbls, epochs = Epoch_S, shuffle=True, batch_size=128, callbacks=[history])
print(j+1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
82
82
82
82
81
81
81
81
81
81
80
79
78
73
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
83
83
83
83
83
83
83
83
83
83
83
83
83
82
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
85
84
84
84
84
84
84
84
84
84
84
84
84
84
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
85
85
85
85
85
85
85
85
85
85
85
85
84
85
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
E

Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
87
87
87
87
87
87
87
87
87
87
87
87
87
87
8


### Model evaluation

In [None]:
# Group the case-study records by task: field 5 of every record is its task name.
valid_ds = {
    task: [record for record in ds_study if task == record[5]]
    for task in sider_tasks
}

In [None]:
# For every SIDER task, score each case-study drug with the trained network.
# Entries are (task, predictions) tuples in `sider_tasks` order.
task_scores = []

for task in sider_tasks:

    l_val = []
    r_val = []
    lbls_valid = []
    for smiles, embbed_drug, onehot_task, embbed_task, lbl, task_name in valid_ds[task]:
        l_val.append(embbed_drug[0])
        r_val.append(embbed_task)
        lbls_valid.append(lbl)

    # Use the same (n, 1024, 1) / (n, 512, 1) layout the network was fitted on,
    # instead of relying on implicit rank adaptation at predict time.
    l1 = np.array(l_val).reshape(-1, 1024, 1)
    r1 = np.array(r_val).reshape(-1, 512, 1)
    lbls_valid = np.array(lbls_valid)

    y_pred = siamese_net.predict([l1, r1])

    result = y_pred
    task_scores.append((task, result))

In [None]:
# Print, task by task, the predicted score of every case-study drug.
separator = " --------------------------------- "
for task in task_scores:
    print(separator)
    print(F'{task[0]}:')
    # task[1] holds one prediction per drug, in the same order as `drug_name`.
    for i, drug in enumerate(task[1]):
        print(F'{i+1}- {drug_name[i]}: {drug}')

 --------------------------------- 
Hepatobiliary disorders:
1- Vancomycin: [0.350539]
2- cyclosporine: [0.9355659]
3- fingolimod: [0.56871724]
4- interferon-beta 1a: [0.11978364]
5- Remdesivir: [0.828961]
6- Oseltamivir: [0.37015772]
7- Ritonavir : [0.98959655]
8- Molnupiravir : [0.4740463]
9- Paxlovid : [0.30573225]
10- Baxdrostat: [0.20183662]
11- Guanfacine : [0.202806]
12- Liraglutide: [0.10674441]
13- Linagliptin: [0.15892702]
 --------------------------------- 
Metabolism and nutrition disorders:
1- Vancomycin: [0.6682925]
2- cyclosporine: [0.9988003]
3- fingolimod: [0.68529713]
4- interferon-beta 1a: [0.44358477]
5- Remdesivir: [0.81775]
6- Oseltamivir: [0.6768574]
7- Ritonavir : [0.9990827]
8- Molnupiravir : [0.7079015]
9- Paxlovid : [0.7342775]
10- Baxdrostat: [0.4607547]
11- Guanfacine : [0.7755146]
12- Liraglutide: [0.2249364]
13- Linagliptin: [0.41168305]
 --------------------------------- 
Product issues:
1- Vancomycin: [0.11146241]
2- cyclosporine: [0.137191]
3- fingolim