In [123]:
import tensorflow as tf
import pandas as pd
import numpy as np
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem

from sklearn.model_selection import train_test_split

In [124]:
df = pd.read_csv('cmpd.csv')
count = 0

train = pd.DataFrame(columns=['inchikey', 'smiles', 'activity'])
test = pd.DataFrame(columns=['inchikey', 'smiles', 'activity'])

for idx in range(len(df)):
    if df.iloc[idx, -2] == 'train':
        train.loc[idx] = df.iloc[idx, [0, 1, 3]]
        count += 1
    else:
        test.loc[idx] = df.iloc[idx, [0, 1, 3]]
        count += 1

train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [125]:
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1].to_list()
X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1].to_list()

In [126]:
def y_to_vec(list):
    for idx in range(len(list)):
        if list[idx] == 'active':
            list[idx] = 1
        else:
            list[idx] = 0
    return list

y_train = y_to_vec(y_train)
y_test = y_to_vec(y_test)

y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

print(y_train)
print(y_test)

[[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]
[[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [127]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=23)

print(len(X_train)) # 3181
print(len(X_val))   # 796
print(len(X_test))  # 1553

print(len(y_train)) # 3181
print(len(y_val))   # 796
print(len(y_test))  # 1553

3181
796
1553
3181
796
1553


In [138]:
def make_dataset(df):
    temp_list = []

    for idx in range(len(df)):
        temp_list.append(df.iloc[idx, :])
    
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, char_level=True)
    tokenizer.fit_on_texts(temp_list)
    top_k = len(tokenizer.word_index)

    temp_seq = tokenizer.texts_to_sequences(temp_list)
    temp_vec = tf.keras.preprocessing.sequence.pad_sequences(temp_seq, padding='post')
    return temp_vec

X_train = make_dataset(X_train)
X_val = make_dataset(X_val)
X_test = make_dataset(X_test)
# X_train_inchikey = make_dataset(X_train, 0)
# X_train_smiles = make_dataset(X_train, 1)
# X_val_inchikey = make_dataset(X_val, 0)
# X_val_smiles = make_dataset(X_val, 1)
# X_test_inchikey = make_dataset(X_test, 0)
# X_test_smiles = make_dataset(X_test, 1)

In [139]:
print(X_train)
# print(X_train_smiles)
print(X_val)
# print(X_val_smiles)
print(X_test)
# print(X_test_smiles)

[[  24   25]
 [  26   27]
 [  28   29]
 ...
 [6329 6330]
 [6331 6332]
 [6333 6334]]
[[   1    2]
 [   3    4]
 [   5    6]
 ...
 [1587 1588]
 [1589 1590]
 [1591 1592]]
[[  19   20]
 [   2    3]
 [   4    5]
 ...
 [3082 3083]
 [3084 3085]
 [3086 3087]]


In [140]:
print(X_train.shape)
print(len(np.unique(X_train)))

(3181, 2)
6334


In [152]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Activation, BatchNormalization, LSTM

In [156]:
model = Sequential()
model.add(Embedding(input_dim=len(np.unique(X_train)), output_dim=11, input_length=2))
model.add(LSTM(64))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(16))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(32))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(2))
model.add(Activation('sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics='acc'
)

model.fit(
    X_train, y_train,
    validation_data=[X_val, y_val],
    epochs=100
)

loss, acc = model.evaluate(
    X_val, y_val
)

print(loss, acc)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100

In [149]:

y_pred = model.predict(X_test)
print(y_pred[:5, 1])
print(y_test[:5])

[0.7311354  0.9257569  0.75772786 0.16909046 0.7906356 ]
[[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]
