In [50]:
import tensorflow as tf
import pandas as pd
import numpy as np
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem

from sklearn.model_selection import train_test_split

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, \
    Activation, BatchNormalization, LSTM, Dropout

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [52]:
df = pd.read_csv('cmpd.csv')
count = 0

train = pd.DataFrame(columns=['inchikey', 'smiles', 'activity'])
test = pd.DataFrame(columns=['inchikey', 'smiles', 'activity'])

for idx in range(len(df)):
    if df.iloc[idx, -2] == 'train':
        train.loc[idx] = df.iloc[idx, [0, 1, 3]]
        count += 1
    else:
        test.loc[idx] = df.iloc[idx, [0, 1, 3]]
        count += 1

train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [53]:
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1].to_list()
X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1].to_list()

In [54]:
def y_to_vec(list):
    for idx in range(len(list)):
        if list[idx] == 'active':
            list[idx] = 1
        else:
            list[idx] = 0
    return list

y_train = y_to_vec(y_train)
y_test = y_to_vec(y_test)

y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

In [55]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.9, random_state=23)

print(len(X_train)) # 3181
print(len(X_val))   # 796
print(len(X_test))  # 1553

print(len(y_train)) # 3181
print(len(y_val))   # 796
print(len(y_test))  # 1553

3579
398
1553
3579
398
1553


In [56]:
def make_dataset(df):
    temp_list = []

    for idx in range(len(df)):
        temp_list.append(df.iloc[idx, :])
    
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, char_level=True)
    tokenizer.fit_on_texts(temp_list)
    top_k = len(tokenizer.word_index)

    temp_seq = tokenizer.texts_to_sequences(temp_list)
    temp_vec = tf.keras.preprocessing.sequence.pad_sequences(temp_seq, padding='post')
    return temp_vec

X_train = make_dataset(X_train)
X_val = make_dataset(X_val)
X_test = make_dataset(X_test)

In [57]:
print(X_train.shape)
print(len(np.unique(X_train)))

(3579, 2)
7121


In [58]:
model = Sequential()
model.add(Embedding(input_dim=len(np.unique(X_train)), output_dim=11, input_length=2))
# model.add(LSTM(32))
# model.add(BatchNormalization())
# model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(16))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(32))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(16))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(8))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(2))
model.add(Activation('sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics='acc'
)

model.fit(
    X_train, y_train,
    validation_data=[X_val, y_val],
    epochs=100, batch_size=64
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [70]:

loss, acc = model.evaluate(
    X_test, y_test
)


y_pred = model.predict(y_test)[:, 1]
for idx in range(len(y_pred)):
    if y_pred[idx] > 0.5:
        y_pred[idx] = 1.
    else:
        y_pred[idx] = 0.
_y_test = np.max(y_test, axis=1)

print(y_pred)
print(_y_test)

acc_score = accuracy_score(_y_test, y_pred)
recall = recall_score(_y_test, y_pred)
precision = precision_score(_y_test, y_pred)
f1 = f1_score(_y_test, y_pred)

print(loss, acc)
print(acc_score)
print(recall)
print(precision)
print(f1)
# log_loss = metrics.log_loss(np.max(y_test, axis=1), y_pred, labels=[0, 1])
# print(log_loss)

[1. 1. 1. ... 0. 0. 0.]
[1. 1. 1. ... 1. 1. 1.]
4.560804843902588 0.4236960709095001
0.35930457179652286
0.35930457179652286
1.0
0.5286594031264803


In [None]:
# precision, recall, _ = metrics.precision_recall_curve(np.max(y_test, axis=1), y_pred, pos_label=None)
# fpr_roc, tpr_roc, _ = metrics.roc_curve(np.max(y_test, axis=1), y_pred, pos_label=None)

# auc1 = metrics.auc(recall, precision)
# auc2 = metrics.auc(fpr_roc, tpr_roc)

# print(auc1)
# print(auc2)

1.0
nan




In [None]:
# train val 0.9 (BatchNormal)
# 5.406352996826172 0.5251256227493286

# train val 0.9 (Dropout)
# 31.515090942382812 0.5175879597663879

# train val 0.9 (BatchNormal + Dropout)
# 8.611763000488281 0.5150753855705261

# train val 0.9 (BatchNormal + adjust node)
# 3.570016622543335 0.5025125741958618