In [None]:
import pandas as pd
import numpy as np
import pickle

from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

# --- Reproducibility -------------------------------------------------------
seed = 42
np.random.seed(seed)  # seeds NumPy's global RNG only

# --- Dataset construction --------------------------------------------------
num_samples = 1000  # number of sequences sampled for every ligand
pad_length = 1000   # SMILES length cut-off and row count of the padded ligand matrix

# --- LSTM layer ------------------------------------------------------------
lstm_size = 128     # first positional argument of the LSTM layer
dropout = 0.2       # passed as the LSTM `dropout` argument
rec_dropout = 0.2   # passed as the LSTM `recurrent_dropout` argument

# --- Training --------------------------------------------------------------
epochs = 20
batch_size = 128    # used for both fitting and evaluation

In [None]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``.

    Unpickling can execute arbitrary code, so only point this at files
    produced by a trusted pipeline.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Vectorizer used later to transform protein sequences; its vocabulary size
# fixes the width of the sequence part of the model input.
tfidf = _unpickle("../models/tfidf.pickle")
seq_vec_length = len(tfidf.vocabulary_)

# Mapping whose values are themselves pickled classifiers (unpickled per key
# when the dataset is assembled).
PUC_models = _unpickle("../models/PUC.pickle")

# Callable invoked as sml_vec(pad_length, smiles) to vectorize a SMILES string.
sml_vec = _unpickle("../models/sml_vec.pickle")

In [None]:
# Ligand table: only the "id" and "SMILES" columns are read, and "id" becomes
# the row index.
df_ligands = pd.read_csv(
    "ligands.csv",
    usecols=["id", "SMILES"],
    index_col="id",
)

# Sequence table, indexed by its "index" column; the "sequence" column is read
# when the dataset is assembled.
df_sequences = pd.read_csv(
    "sequences.csv",
    index_col="index",
)

In [None]:
# Assemble one row per (ligand, sampled sequence) pair.
#
# Fixes relative to the previous version of this cell:
#   * `seq_data` was an undefined name (NameError); the sequence vector is used.
#   * Scalar columns were assigned to an empty DataFrame, which creates no rows,
#     and the sampled sequences were assigned with index alignment against a
#     different index (yielding NaN). Rows are now built explicitly.
#   * The 1-row sequence vector is densified and repeated to `pad_length` rows
#     so it can be concatenated column-wise with the padded ligand matrix.
#   * The frame is built once from a list of records instead of calling
#     `pd.concat` inside the loop.
#
# NOTE(review): every `t_vec` is a dense (pad_length, 128 + vocabulary size)
# array; with large vocabularies this needs a lot of memory.

lig_vec_dim = 128  # width of the ligand vectors; matches the zero-padding width and the model input


def _pad_rows(mat, n_rows, n_cols):
    """Zero-pad the 2-D array ``mat`` along axis 0 up to ``n_rows`` rows.

    Raises:
        ValueError: (from NumPy) if ``mat`` has more than ``n_rows`` rows or its
            width is not ``n_cols``.
    """
    mat = np.asarray(mat)
    filler = np.zeros((n_rows - mat.shape[0], n_cols))
    return np.concatenate((mat, filler))


total_columns = [
    "lig_id", "lig_SMILES", "lig_vec",
    "seq_id", "seq_sequence", "seq_vec",
    "pred_binding", "t_vec",
]

records = []
for i, k in enumerate(PUC_models):
    # NOTE(review): the ligand is looked up by enumeration position `i`, not by
    # the model key `k` -- confirm that positions coincide with ligand ids.
    smiles = df_ligands.loc[i, "SMILES"]
    if len(smiles) >= pad_length:
        continue  # SMILES too long for the fixed-size ligand matrix

    bc_model = pickle.loads(PUC_models[k])

    # Assumes sml_vec returns a single (n_rows, 128) array with
    # n_rows <= pad_length -- TODO confirm against the sml_vec implementation.
    lig_vec = _pad_rows(sml_vec(pad_length, smiles), pad_length, lig_vec_dim)

    seq_samp = df_sequences.sample(n=num_samples, random_state=seed + i)

    # Vectorize and classify all sampled sequences in one call each.
    # Assumes tfidf.transform returns a SciPy sparse matrix and bc_model follows
    # the scikit-learn predict(X) convention -- TODO confirm.
    seq_matrix = tfidf.transform(seq_samp["sequence"])
    labels = np.asarray(bc_model.predict(seq_matrix))

    for row, (seq_id, sequence) in enumerate(seq_samp["sequence"].items()):
        seq_vec = seq_matrix[row]  # 1-row sparse vector, as stored before
        seq_block = np.repeat(seq_vec.toarray(), pad_length, axis=0)
        records.append({
            "lig_id": i,
            "lig_SMILES": smiles,
            "lig_vec": lig_vec,
            "seq_id": seq_id,
            "seq_sequence": sequence,
            "seq_vec": seq_vec,
            # Label repeated pad_length times, same layout as before.
            "pred_binding": np.repeat(labels[row], pad_length),
            "t_vec": np.concatenate((lig_vec, seq_block), axis=1),
        })

# Passing `columns` keeps the expected schema even when no ligand qualified.
df_total = pd.DataFrame(records, columns=total_columns)

In [None]:
# 80/20 train/test split: sample the training rows, the remainder is the test set.
df_train = df_total.sample(frac=0.8, random_state=seed)
df_test = df_total.drop(index=df_train.index)

# Each cell of "t_vec" / "pred_binding" holds a NumPy array, so `.values` would
# give a 1-D object array that Keras cannot convert to a tensor. Stack the
# per-row arrays into one contiguous ndarray with a leading sample axis.
X_train = np.stack(df_train["t_vec"].tolist())
X_test = np.stack(df_test["t_vec"].tolist())

# NOTE(review): "pred_binding" stores the label repeated pad_length times, so
# the stacked targets have one column per time step -- confirm this matches the
# model's output layer.
y_train = np.stack(df_train["pred_binding"].tolist())
y_test = np.stack(df_test["pred_binding"].tolist())

In [None]:
# Single-layer LSTM binary classifier.
# The input is a sequence of `pad_length` time steps, each holding the 128
# ligand features followed by `seq_vec_length` sequence features.
# (`max_features` was never defined; `pad_length` is the time-step count used
# when the input matrices are built.)
model = Sequential()
model.add(
    LSTM(
        lstm_size,
        input_shape=(pad_length, 128 + seq_vec_length),
        dropout=dropout,
        recurrent_dropout=rec_dropout,
    )
)
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# The split cell defines `X_train` / `X_test` (capital X); the lowercase names
# used here before did not exist.
# NOTE(review): the test split doubles as validation data, so the final test
# score is not an independent estimate.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

In [None]:
# Evaluate on the held-out split. The model is compiled with a single
# 'accuracy' metric, so evaluate() yields the loss followed by the accuracy.
# (`X_test` is the name defined by the split cell; lowercase `x_test` was undefined.)
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)