In [82]:
# modules, labraries import
import tensorflow as tf
import pandas as pd
import numpy as np
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem

In [83]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, \
    Activation, BatchNormalization, LSTM, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import MinMaxScaler

In [84]:
# make dataset
df = pd.read_csv('cmpd.csv')

# add to mol columns
df['mol'] = df.smiles.apply(Chem.MolFromSmiles)
df['mol'] = df.mol.apply(Chem.AddHs)

# add to number of atoms, number of heavy atoms columns
# separte inchikey based "-"
for idx in range(len(df)):
    temp_list = []
    temp_list.append(df.iloc[idx, 0].split("-"))
    df['num_of_atoms'] = df['mol'][idx].GetNumAtoms()
    df['num_of_heavy_atoms'] = df['mol'][idx].GetNumHeavyAtoms()
    df['inchikey_1'] = temp_list[0][0]
    df['inchikey_2'] = temp_list[0][1]

# sorted df columns
df = df[['inchikey', 'inchikey_1', 'inchikey_2', 'smiles', 'group', 'mol', 'num_of_atoms', 'num_of_heavy_atoms', 'activity']]
df

Unnamed: 0,inchikey,inchikey_1,inchikey_2,smiles,group,mol,num_of_atoms,num_of_heavy_atoms,activity
0,FNHKPVJBJVTLMP-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,train,<rdkit.Chem.rdchem.Mol object at 0x000001F946B...,74,39,active
1,CUDVHEFYRIWYQD-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,train,<rdkit.Chem.rdchem.Mol object at 0x000001F952C...,74,39,active
2,TTZSNFLLYPYKIL-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,test,<rdkit.Chem.rdchem.Mol object at 0x000001F952C...,74,39,active
3,UOVCGJXDGOGOCZ-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,COc1cc2c(cc1F)C(c1ccccc1Cl)=Nc1c(n[nH]c1C)N2,train,<rdkit.Chem.rdchem.Mol object at 0x000001F952C...,74,39,active
4,CUIHSIWYWATEQL-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,<rdkit.Chem.rdchem.Mol object at 0x000001F952C...,74,39,active
...,...,...,...,...,...,...,...,...,...
5525,UBAHPEHGSJRHGA-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,Cc1cccc2nc(-c3ccc(-c4cccc(CN5CCC(C(N)=O)CC5)c4...,test,<rdkit.Chem.rdchem.Mol object at 0x000001F952D...,74,39,inactive
5526,RTTIKBHDHKOSNI-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,Cc1c2ccccc2nc2c1c1cc(NCCN(C)C)ccc1n2CCN(C)C,test,<rdkit.Chem.rdchem.Mol object at 0x000001F952D...,74,39,inactive
5527,HVUOSZANYULBJR-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,Cc1ccc(-c2ccn(-c3ccc4c5c(n(C)c4c3)CCCNC5)c(=O)...,test,<rdkit.Chem.rdchem.Mol object at 0x000001F952D...,74,39,inactive
5528,SNFWCJIVWUVRNO-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,N#Cc1c(-c2ccccc2C(F)(F)F)nc(SCc2ccc(OC(F)(F)F)...,test,<rdkit.Chem.rdchem.Mol object at 0x000001F952D...,74,39,inactive


In [85]:
# make train, test dataset formatted DataFrame
train = pd.DataFrame(columns=['inchikey', 'inchikey_1', 'inchikey_2', 'smiles', 'mol', 'num_of_atoms', 'num_of_heavy_atoms', 'activity'])
test = pd.DataFrame(columns=['inchikey', 'inchikey_1', 'inchikey_2', 'smiles', 'mol', 'num_of_atoms', 'num_of_heavy_atoms', 'activity'])

# add to values each columns
for idx in range(len(df)):
    if df.iloc[idx, 4] == 'train':
        train.loc[idx] = df.loc[idx, ['inchikey', 'inchikey_1', 'inchikey_2', 'smiles', 'mol', 'num_of_atoms', 'num_of_heavy_atoms', 'activity']]
    else:
        test.loc[idx] = df.loc[idx, ['inchikey', 'inchikey_1', 'inchikey_2', 'smiles', 'mol', 'num_of_atoms', 'num_of_heavy_atoms', 'activity']]

# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train

Unnamed: 0,inchikey,inchikey_1,inchikey_2,smiles,mol,num_of_atoms,num_of_heavy_atoms,activity
0,FNHKPVJBJVTLMP-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,<rdkit.Chem.rdchem.Mol object at 0x000001F946B...,74,39,active
1,CUDVHEFYRIWYQD-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,<rdkit.Chem.rdchem.Mol object at 0x000001F952C...,74,39,active
2,UOVCGJXDGOGOCZ-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,COc1cc2c(cc1F)C(c1ccccc1Cl)=Nc1c(n[nH]c1C)N2,<rdkit.Chem.rdchem.Mol object at 0x000001F952C...,74,39,active
3,IFPPYSWJNWHOLQ-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,CCN(CC)CCOc1ccc(Nc2ncc3cc(-c4c(Cl)cccc4Cl)c(=O...,<rdkit.Chem.rdchem.Mol object at 0x000001F952C...,74,39,active
4,WOSKHXYHFSIKNG-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,COc1cc2nccc(Oc3ccc(NC(=O)NC4CC4)c(Cl)c3)c2cc1C...,<rdkit.Chem.rdchem.Mol object at 0x000001F952C...,74,39,active
...,...,...,...,...,...,...,...,...
3972,INSBKYCYLCEBOD-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,O=C(O)Cc1c2ccccc2n2c1[nH]c(=O)c1ccccc12,<rdkit.Chem.rdchem.Mol object at 0x000001F952D...,74,39,active
3973,KTUFNOKKBVMGRW-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,<rdkit.Chem.rdchem.Mol object at 0x000001F952D...,74,39,inactive
3974,SVRAGOOKTLUHES-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,CN1CCC(n2cnc(-c3ccc(F)cc3)c2-c2ccnc(N)n2)CC1,<rdkit.Chem.rdchem.Mol object at 0x000001F952D...,74,39,unknown
3975,QQJUCFIPZAVTEU-UHFFFAOYSA-N,MKSAGABLDNGEAP,DHIUTWEWSA,CC1(C)CC(n2cnc(-c3ccc(F)cc3)c2-c2ccnc(N)n2)CC(...,<rdkit.Chem.rdchem.Mol object at 0x000001F952D...,74,39,unknown


In [86]:
# split X, y (y is based on 'activity' column)
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1].to_list()
X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1].to_list()

In [87]:
# make y string -> vector
def y_to_vec(list):
    for idx in range(len(list)):
        if list[idx] == 'active':
            list[idx] = 1
        else:
            list[idx] = 0
    return list

y_train = y_to_vec(y_train)
y_test = y_to_vec(y_test)

# one hot encoding y
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

In [88]:
# split train, validation dataset
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.9, random_state=23)

In [89]:
# tokenized X data
def make_dataset(df):
    temp_list = []

    for idx in range(len(df)):
        temp_list.append(df.iloc[idx, :])
    
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, char_level=True)
    tokenizer.fit_on_texts(temp_list)
    top_k = len(tokenizer.word_index)

    temp_seq = tokenizer.texts_to_sequences(temp_list)
    temp_vec = tf.keras.preprocessing.sequence.pad_sequences(temp_seq, padding='post')
    return temp_vec

X_train = make_dataset(X_train)
X_val = make_dataset(X_val)
X_test = make_dataset(X_test)

In [90]:
# data scaling used MinMaxScaler
# scaler = MinMaxScaler()

# def mms(scaler, data):
#     mms = scaler
#     mms.fit(data)
#     data = mms.transform(data)
#     return data

# X_train = mms(scaler, X_train)
# X_val = mms(scaler, X_val)
# X_test = mms(scaler, X_test)

In [91]:
X_train

array([[   35,     1,     2, ...,    37,     3,     4],
       [   38,     1,     2, ...,    40,     3,     4],
       [   41,     1,     2, ...,    43,     3,     4],
       ...,
       [10696,     1,     2, ..., 10698,     3,     4],
       [10699,     1,     2, ..., 10701,     3,     4],
       [10702,     1,     2, ..., 10704,     3,     4]])

In [92]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

print(len(np.unique(X_train)))

(3579, 7)
(398, 7)
(1553, 7)
10704


In [93]:
# modeling, used 'sigmoid' activation function in last layer
model = Sequential()
model.add(Embedding(input_dim=len(np.unique(X_train)), output_dim=10, input_length=7))
model.add(Flatten())
model.add(Dense(128))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(64))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(32))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(16))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(8))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(2))
model.add(Activation('sigmoid')) # binary classification (active(1), else(0))

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 7, 10)             107040    
                                                                 
 flatten_4 (Flatten)         (None, 70)                0         
                                                                 
 dense_24 (Dense)            (None, 128)               9088      
                                                                 
 batch_normalization_20 (Bat  (None, 128)              512       
 chNormalization)                                                
                                                                 
 activation_24 (Activation)  (None, 128)               0         
                                                                 
 dense_25 (Dense)            (None, 64)                8256      
                                                      

In [94]:
rl = ReduceLROnPlateau(mode='auto', monitor='val_loss', patience=10)

In [95]:
# complie, fitting and save model
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics='acc'
)

model.fit(
    X_train, y_train,
    validation_data=[X_val, y_val],
    epochs=100, batch_size=64
)

model.save(filepath='./model.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [96]:
# evaluate, predict test dataset
loss, _ = model.evaluate(
    X_test, y_test
)

y_pred = model.predict(X_test)[:, 1]
for idx in range(len(y_pred)):
    if y_pred[idx] > 0.5:
        y_pred[idx] = 1.
    else:
        y_pred[idx] = 0.

y_test_2 = []
for idx in range(len(y_test)):
    y_test_2.append(y_test[idx, 1])



In [98]:
# evaluate used sklearn metrics
acc_score = accuracy_score(y_test_2, y_pred)
recall = recall_score(y_test_2, y_pred)
precision = precision_score(y_test_2, y_pred)
f1 = f1_score(y_test_2, y_pred)

print("loss : ", loss)
print("acc_score : ", acc_score)
print("recall : ", recall)
print("precision : ", precision)
print("f1 : ", f1)

loss :  2.215696096420288
acc_score :  0.5093367675466839
recall :  0.5448028673835126
precision :  0.37438423645320196
f1 :  0.4437956204379562
