In [1]:
import pandas as pd
import numpy as np
import pickle

from keras.models import Model
from keras.layers import Input, Dense, Embedding, Concatenate
from keras.layers import LSTM

# --- Notebook-wide configuration -------------------------------------------
# Seed reused for numpy's global RNG and as the base `random_state` of the
# pandas `.sample` calls in later cells.
seed = 42
np.random.seed(seed)
# Number of sequences sampled from df_sequences for each ligand.
num_samples = 100
# Number of units of the LSTM layer that reads the one-hot SMILES matrix.
lstm_size = 128
# Input dropout and recurrent dropout passed to the LSTM layer.
dropout = 0.2
rec_dropout = 0.2
# Training epochs and mini-batch size (batch_size is also used by evaluate).
epochs = 20
batch_size = 128
# Number of rows of the one-hot SMILES matrix (LSTM time steps); ligands whose
# SMILES string is not shorter than this are skipped when the dataset is built.
pad_length = 1000

Using TensorFlow backend.


In [2]:
def sml_vec(pad_length, smiles, alphabet_size=128):
    """One-hot encode a SMILES string into a zero-padded matrix.

    Row ``i`` of the result has a single 1.0 in column ``ord(smiles[i])``;
    rows past the end of the string stay all-zero.

    Parameters
    ----------
    pad_length : int
        Number of rows of the returned matrix (maximum string length).
    smiles : str
        String to encode, one character per row.
    alphabet_size : int, optional
        Number of columns, i.e. one more than the largest code point that
        can be encoded. Defaults to 128 (ASCII).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(pad_length, alphabet_size)``.

    Raises
    ------
    IndexError
        If ``smiles`` is longer than ``pad_length`` or contains a character
        whose code point is >= ``alphabet_size``.
    """
    codes = np.fromiter((ord(ch) for ch in smiles), dtype=np.intp, count=len(smiles))
    # Validate up front so the caller gets an actionable message instead of
    # numpy's generic out-of-bounds error.
    if codes.size > pad_length:
        raise IndexError(
            "SMILES of length %d does not fit in pad_length=%d" % (codes.size, pad_length)
        )
    if codes.size and codes.max() >= alphabet_size:
        raise IndexError(
            "SMILES contains a character with code point %d >= alphabet_size=%d"
            % (codes.max(), alphabet_size)
        )
    vec = np.zeros((pad_length, alphabet_size))
    # Fancy indexing sets one (row, column) cell per character in one shot.
    vec[np.arange(codes.size), codes] = 1.0
    return vec

In [3]:
# SECURITY NOTE: unpickling executes arbitrary code embedded in the file;
# only load model files that come from a trusted source.

# Fitted text vectorizer; its vocabulary size is the length of one sequence vector.
with open("../models/tfidf.pickle", "rb") as tfidf_file:
    tfidf = pickle.load(tfidf_file)
seq_vec_length = len(tfidf.vocabulary_)

# Mapping whose values are themselves pickled byte strings: unpickle each
# value in place so PUC_models ends up holding the model objects.
with open("../models/PUC.pickle", "rb") as puc_file:
    PUC_models = pickle.load(puc_file)
for model_key in list(PUC_models):
    PUC_models[model_key] = pickle.loads(PUC_models[model_key])

print(seq_vec_length)

10000


In [4]:
# Ligand table: only the id (used as index) and SMILES columns are read.
df_ligands = pd.read_csv(
    "../data/ligands.csv",
    index_col="id",
    usecols=["id", "SMILES"],
)
# Sequence table: the first CSV column becomes the index.
df_sequences = pd.read_csv(
    "../data/sequences.csv",
    index_col=0,
)

In [5]:
# Build the training table: for every per-ligand model, sample `num_samples`
# sequences, vectorize them, and label each (ligand, sequence) pair with that
# model's prediction.
#
# Columns of df_total: seq_id, seq_sequence, seq_vec, lig_id, lig_SMILES,
# lig_vec, pred_binding.
ligand_frames = []  # one frame per ligand, concatenated once after the loop
for i, k in enumerate(PUC_models):
    print(i)
    bc_model = PUC_models[k]
    # NOTE(review): the ligand is looked up by enumeration position `i`, not by
    # the model key `k` -- this presumably relies on PUC_models being ordered
    # like the ligand ids 0..N-1; confirm.
    smiles = df_ligands.loc[i, "SMILES"]
    if len(smiles) < pad_length:
        seq_samp = df_sequences.sample(n=num_samples, random_state=seed + i)
        sequences = seq_samp["sequence"].values

        # Vectorize and predict the whole sample in one call each instead of
        # once per row; rows are transformed independently so the vectors are
        # the same. (Assumes bc_model.predict is row-wise like a scikit-learn
        # estimator -- TODO confirm.)
        seq_matrix = tfidf.transform(sequences).toarray()
        # ravel() yields one scalar label per row; storing the raw 1-element
        # arrays would make `pred_binding` an object column.
        predictions = np.ravel(bc_model.predict(seq_matrix))

        # sml_vec already returns exactly pad_length rows, so no extra padding
        # is needed. Every row of this ligand references the same matrix.
        smlvec = sml_vec(pad_length, smiles)

        df_temp = pd.DataFrame({
            "seq_id": seq_samp.index.values,
            "seq_sequence": sequences,
            "seq_vec": list(seq_matrix),
            "lig_id": i,
            "lig_SMILES": smiles,
            "lig_vec": [smlvec] * len(seq_samp),
            "pred_binding": predictions,
        })
        ligand_frames.append(df_temp)

# A single concat avoids re-copying the accumulated frame on every iteration.
if ligand_frames:
    df_total = pd.concat(ligand_frames, ignore_index=True)
else:
    df_total = pd.DataFrame()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33


In [24]:
# Sanity check: report the shapes of the stacked feature arrays rather than
# dumping their full contents (the ligand tensor alone has
# rows x pad_length x 128 entries).
print(np.array([x for x in df_total["lig_vec"].values]).shape)
print(np.array([x for x in df_total["seq_vec"].values]).shape)

(3400, 1000, 128)
(3400, 10000)


In [None]:
# Attempts to store, for every row, the ligand matrix and the sequence vector
# joined along axis 1 in a new "t_vec" column. No other visible cell reads
# "t_vec"; the model below consumes lig_vec and seq_vec as separate inputs.
#
# NOTE(review): lig_vec is the 2-D matrix produced by sml_vec while seq_vec is
# the 1-D row taken with `.toarray()[0]`, so np.concatenate(..., axis=1) is
# expected to raise because the inputs differ in number of dimensions.
# seq_vec would first have to be tiled to one copy per lig_vec row, which is
# very large at these sizes -- confirm the intent or drop this cell.
# NOTE(review): assigning a 2-D array through `.loc[i, ["t_vec"]]` into a
# column initialised with the integer 0 may also be rejected by pandas -- verify.
df_total["t_vec"] = 0
for i in df_total.index:
    lig_vec = df_total.loc[i,"lig_vec"]
    seq_vec = df_total.loc[i,"seq_vec"]
    df_total.loc[i, ["t_vec"]] = np.concatenate((lig_vec, seq_vec), axis=1)

In [35]:
# Split df_total into test / train / validation sets and materialise the
# numpy arrays fed to the model.
#
# The first fifth of the rows (by position in the index) is held out as the
# test set; the remainder is split 80/20 at random into train and validation.
indices = df_total.index.values
n_test = num_samples * len(PUC_models) // 5

df_test = df_total.loc[df_total.index.isin(indices[:n_test])]
df_fit = df_total.loc[df_total.index.isin(indices[n_test:])]
df_train = df_fit.sample(frac=0.8, random_state=seed)
df_val = df_fit.loc[~df_fit.index.isin(df_train.index)]


def _stack_column(frame, column):
    """Stack the per-row arrays stored in `column` into a single ndarray."""
    return np.array(list(frame[column].values))


def _label_vector(frame):
    """Return `pred_binding` as a flat 1-D array with one label per row.

    Accepts both scalar labels and 1-element arrays in the column.
    """
    return np.array(list(frame["pred_binding"].values)).reshape(-1)


X_train_a = _stack_column(df_train, "lig_vec")
X_train_b = _stack_column(df_train, "seq_vec")
X_val_a = _stack_column(df_val, "lig_vec")
X_val_b = _stack_column(df_val, "seq_vec")
X_test_a = _stack_column(df_test, "lig_vec")
X_test_b = _stack_column(df_test, "seq_vec")
y_train = _label_vector(df_train)
y_val = _label_vector(df_val)
# Test labels must come from the test rows (previously taken from df_val).
y_test = _label_vector(df_test)

# Inputs and labels of each split must line up row for row.
assert len(X_train_a) == len(X_train_b) == len(y_train)
assert len(X_val_a) == len(X_val_b) == len(y_val)
assert len(X_test_a) == len(X_test_b) == len(y_test)
assert len(df_train) + len(df_val) + len(df_test) == len(df_total)

In [None]:
# Two-input binary classifier:
#   branch a: one-hot SMILES matrix (pad_length x 128) summarised by an LSTM
#   branch b: sequence vector of length seq_vec_length, used as-is
# The LSTM output and the sequence vector are concatenated and fed to a single
# sigmoid unit.
inp_a = Input(shape=(pad_length, 128))
lstm_layer = LSTM(
    lstm_size,
    dropout=dropout,
    recurrent_dropout=rec_dropout,
)
x_a = lstm_layer(inp_a)

inp_b = Input(shape=(seq_vec_length,))

merged = Concatenate()([x_a, inp_b])
x = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[inp_a, inp_b], outputs=[x])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train, reporting validation metrics after every epoch.
model.fit(
    [X_train_a, X_train_b],
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([X_val_a, X_val_b], y_val),
    verbose=1,
)

In [None]:
# Evaluate on the held-out test arrays; evaluate returns the loss followed by
# the compiled metric (accuracy).
test_inputs = [X_test_a, X_test_b]
score, acc = model.evaluate(
    test_inputs,
    y_test,
    batch_size=batch_size,
)
print('Test score:', score)
print('Test accuracy:', acc)