# Markov Chain Neural Network 
#### With Keras

#### Imports:

In [1]:
import numpy as np
import pandas as pd
import os
import shutil

from sklearn.model_selection import train_test_split

import molvs as mv
from rdkit import Chem
from rdkit.Chem import DataStructs, AllChem
from sklearn import metrics, model_selection
from sklearn.metrics import roc_auc_score, accuracy_score
import tensorflow.keras as keras
import tqdm

from keras.regularizers import l2
from keras.layers import Input, Dense, Dropout
from keras import Model, optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint

import warnings
warnings.filterwarnings('ignore')

## Data Imports

#### Morgan Fingerprint

In [2]:
cyp2c19_df = pd.read_pickle("./data/cyp_datasets/cyp2c19.pkl")
cyp2c9_df = pd.read_pickle("./data/cyp_datasets/cyp2c9.pkl")
cyp1a2_df = pd.read_pickle("./data/cyp_datasets/cyp1a2.pkl")
cyp2d6_df = pd.read_pickle("./data/cyp_datasets/cyp2d6.pkl")
cyp3a4_df = pd.read_pickle("./data/cyp_datasets/cyp3a4.pkl")

#### SwissADME Feature Set

In [3]:
cyp2c19_swiss_feat = pd.read_pickle('./data/cyp_datasets/cyp2c19_swiss_feat.pkl')
cyp2c9_swiss_feat = pd.read_pickle('./data/cyp_datasets/cyp2c9_swiss_feat.pkl')
cyp2d6_swiss_feat = pd.read_pickle('./data/cyp_datasets/cyp2d6_swiss_feat.pkl')
cyp1a2_swiss_feat = pd.read_pickle('./data/cyp_datasets/cyp1a2_swiss_feat.pkl')
cyp3a4_swiss_feat = pd.read_pickle('./data/cyp_datasets/cyp3a4_swiss_feat.pkl')

### Train, Test, Split with validation data

In [4]:
# Train, test, valid set split
def split_var(data_set):   
    X_data = np.array(data_set)
    y_data = np.array(data_set['Inhibition Observed'])
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X_data, y_data, test_size = 0.1)
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(X_train, y_train, test_size = 0.11)
    return X_train, X_test, X_valid, y_train, y_test, y_valid

### Markov Chain Neural Network

In [5]:
def MC_NN(X_train, X_test, X_valid, y_train, y_test, y_valid, activation = None, hidden_nodes = 1024, dr = 0, sample_time = 300):
    
    X_train = np.array(X_train)
    X_valid = np.array(X_valid)
    X_test = np.array(X_test)
   
    inputShape=(X_train.shape[1] , )    
    inputs = Input(inputShape, name = 'Input')
    x = Dropout(dr)(inputs)
    x = Dense(hidden_nodes, activation = activation, kernel_regularizer = l2(1e-4))(x)
    x = Dropout(0.5)(x, training = True)
    y = Dense(1, activation = 'sigmoid', name = 'Output')(x)
 
    model = Model(inputs=inputs, outputs = y)
    model.compile(optimizer=optimizers.Adam(0.0001), loss='binary_crossentropy', metrics=['accuracy']) 
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0)
    
    improvement_dir = 'Weights_improvement'
    if not os.path.exists(improvement_dir):
        os.mkdir(improvement_dir)
    filepath="%s/weights-improvement-{epoch:02d}-{val_loss:.2f}.hdf5" % improvement_dir
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, mode='auto')
    history = model.fit(X_train, y_train, batch_size= 200, epochs= 300, validation_data=(X_valid, y_valid), callbacks=[checkpoint, early_stopping], verbose=0)
    if os.path.exists(improvement_dir):
        shutil.rmtree(improvement_dir)

    mc_predictions = []
    for i in tqdm.tqdm(range(sample_time)):
        y_p = model.predict(X_test, batch_size=200)
        mc_predictions.append(y_p)
    y_std = np.std(np.array(mc_predictions), axis = 0)
    y_mean = np.mean(np.array(mc_predictions), axis = 0)

    y_pred = y_mean.copy()
    auc = roc_auc_score(y_test, y_pred)
    y_pred[y_pred >= 0.5] = 1
    y_pred[y_pred < 0.5] = 0
    acc = accuracy_score(y_test, y_pred)
    mc_pred = [mc_predictions,y_mean, y_std]
    print ('Dropout rate = %s, AUC: %.3f, ACC: %.3f' % (dr, auc, acc))
    return model, history, auc, acc, mc_pred

### Morgan Fingerprint MC_NN

#### CYP2c19

In [6]:
X = cyp2c19_df.drop(cyp2c19_df[['index', 'Inhibition Observed']], axis=1)
y = cyp2c19_df['Inhibition Observed']
X_train, X_test, X_valid, y_train, y_test, y_valid = split_var(cyp2c19_df)
cyp2c19_nn = MC_NN(X_train, X_test, X_valid, y_train, y_test, y_valid)

100%|██████████| 300/300 [00:14<00:00, 20.58it/s]

Dropout rate = 0, AUC: 0.478, ACC: 0.953





In [7]:
cyp2c19_nn = pd.DataFrame(cyp2c19_nn[2:5])
cyp2c19_nn.reset_index(inplace = True)
cyp2c19_nn.columns = ['index', 'Scores']
cyp2c19_nn.to_csv('./data/model_output/cyp2c19_nn.csv')

In [8]:
cyp2c19_nn.head()

Unnamed: 0,index,Scores
0,0,0.478211
1,1,0.953353
2,2,"[[[[1.], [1.], [1.], [1.], [1.], [2.813853e-35..."


#### CYP2c9

In [9]:
X = cyp2c9_df.drop(cyp2c9_df[['index', 'Inhibition Observed']], axis=1)
y = cyp2c9_df['Inhibition Observed']
X_train, X_test, X_valid, y_train, y_test, y_valid = split_var(cyp2c9_df)
cyp2c9_nn = MC_NN(X_train, X_test, X_valid, y_train, y_test, y_valid)

100%|██████████| 300/300 [00:14<00:00, 20.97it/s]

Dropout rate = 0, AUC: 0.460, ACC: 0.915





In [10]:
cyp2c9_nn = pd.DataFrame(cyp2c9_nn[2:5])
cyp2c9_nn.reset_index(inplace = True)
cyp2c9_nn.columns = ['index', 'Scores']
cyp2c9_nn.to_csv('./data/model_output/cyp2c9_nn.csv')

#### CYP2d6

In [11]:
X = cyp2d6_df.drop(cyp2d6_df[['index', 'Inhibition Observed']], axis=1)
y = cyp2d6_df['Inhibition Observed']
X_train, X_test, X_valid, y_train, y_test, y_valid = split_var(cyp2d6_df)
cyp2d6_nn = MC_NN(X_train, X_test, X_valid, y_train, y_test, y_valid)

100%|██████████| 300/300 [00:14<00:00, 20.47it/s]

Dropout rate = 0, AUC: 0.531, ACC: 0.924





In [12]:
cyp2d6_nn = pd.DataFrame(cyp2d6_nn[2:5])
cyp2d6_nn.reset_index(inplace = True)
cyp2d6_nn.columns = ['index', 'Scores']
cyp2d6_nn.to_csv('./data/model_output/cyp2d6_nn.csv')

#### CYP1a2

In [16]:
X = cyp1a2_df.drop(cyp1a2_df[['index', 'Inhibition Observed']], axis=1)
y = cyp1a2_df['Inhibition Observed']
X_train, X_test, X_valid, y_train, y_test, y_valid = split_var(cyp1a2_df)
cyp1a2_nn = MC_NN(X_train, X_test, X_valid, y_train, y_test, y_valid)

100%|██████████| 300/300 [00:15<00:00, 19.72it/s]

Dropout rate = 0, AUC: 0.524, ACC: 0.942





In [17]:
cyp1a2_nn = pd.DataFrame(cyp1a2_nn[2:5])
cyp1a2_nn.reset_index(inplace = True)
cyp1a2_nn.columns = ['index', 'Scores']
cyp1a2_nn.to_csv('./data/model_output/cyp1a2_nn.csv')

#### CYp3a4

In [18]:
X = cyp3a4_df.drop(cyp3a4_df[['index', 'Inhibition Observed']], axis=1)
y = cyp3a4_df['Inhibition Observed']
X_train, X_test, X_valid, y_train, y_test, y_valid = split_var(cyp3a4_df)
cyp3a4_nn = MC_NN(X_train, X_test, X_valid, y_train, y_test, y_valid)

100%|██████████| 300/300 [00:14<00:00, 20.21it/s]

Dropout rate = 0, AUC: 0.579, ACC: 0.945





In [19]:
cyp3a4_nn = pd.DataFrame(cyp3a4_nn[2:5])
cyp3a4_nn.reset_index(inplace = True)
cyp3a4_nn.columns = ['index', 'Scores']
cyp3a4_nn.to_csv('./data/model_output/cyp3a4_nn.csv')

### SwissADME Features MC_NN

#### CYP2c19

In [20]:
X = cyp2c19_swiss_feat.drop(cyp2c19_swiss_feat[['index', 'Inhibition Observed']], axis=1)
y = cyp2c19_swiss_feat['Inhibition Observed']
X_train, X_test, X_valid, y_train, y_test, y_valid = split_var(cyp2c9_df)
cyp2c19_sw_nn = MC_NN(X_train, X_test, X_valid, y_train, y_test, y_valid)

100%|██████████| 300/300 [00:15<00:00, 19.20it/s]

Dropout rate = 0, AUC: 0.422, ACC: 0.942





In [21]:
cyp2c19_sw_nn = pd.DataFrame(cyp2c19_sw_nn[2:5])
cyp2c19_sw_nn.reset_index(inplace = True)
cyp2c19_sw_nn.columns = ['index', 'Scores']
cyp2c19_sw_nn.to_csv('./data/model_output/cyp2c19_sw_nn.csv')

#### CYP2c9

In [25]:
X = cyp2c9_swiss_feat.drop(cyp2c9_swiss_feat[['index', 'Inhibition Observed']], axis=1)
y = cyp2c9_swiss_feat['Inhibition Observed']
X_train, X_test, X_valid, y_train, y_test, y_valid = split_var(cyp2c9_df)
cyp2c9_sw_nn = MC_NN(X_train, X_test, X_valid, y_train, y_test, y_valid)

100%|██████████| 300/300 [00:14<00:00, 20.39it/s]

Dropout rate = 0, AUC: 0.604, ACC: 0.939





In [28]:
cyp2c9_sw_nn = pd.DataFrame(cyp2c9_sw_nn[2:5])
cyp2c9_sw_nn.reset_index(inplace = True)
cyp2c9_sw_nn.columns = ['index', 'Scores']
cyp2c9_sw_nn.to_csv('./data/model_output/cyp2c9_sw_nn.csv')

#### CYP2d6

In [29]:
X = cyp2d6_swiss_feat.drop(cyp2d6_swiss_feat[['index', 'Inhibition Observed']], axis=1)
y = cyp2d6_swiss_feat['Inhibition Observed']
X_train, X_test, X_valid, y_train, y_test, y_valid = split_var(cyp2d6_df)
cyp2d6_sw_nn = MC_NN(X_train, X_test, X_valid, y_train, y_test, y_valid)

100%|██████████| 300/300 [00:13<00:00, 21.64it/s]

Dropout rate = 0, AUC: 0.556, ACC: 0.959





In [32]:
cyp2d6_sw_nn = pd.DataFrame(cyp2d6_sw_nn[2:5])
cyp2d6_sw_nn.reset_index(inplace = True)
cyp2d6_sw_nn.columns = ['index', 'Scores']
cyp2c9_sw_nn.to_csv('./data/model_output/cyp2d6_sw_nn.csv')

#### CYP1a2

In [33]:
X = cyp1a2_swiss_feat.drop(cyp1a2_swiss_feat[['index', 'Inhibition Observed']], axis=1)
y = cyp1a2_swiss_feat['Inhibition Observed']
X_train, X_test, X_valid, y_train, y_test, y_valid = split_var(cyp1a2_df)
cyp1a2_sw_nn = MC_NN(X_train, X_test, X_valid, y_train, y_test, y_valid)

100%|██████████| 300/300 [00:13<00:00, 21.91it/s]

Dropout rate = 0, AUC: 0.534, ACC: 0.945





In [34]:
cyp1a2_sw_nn = pd.DataFrame(cyp1a2_sw_nn[2:5])
cyp1a2_sw_nn.reset_index(inplace = True)
cyp1a2_sw_nn.columns = ['index', 'Scores']
cyp1a2_sw_nn.to_csv('./data/model_output/cyp1a2_sw_nn.csv')

#### CYp3a4

In [35]:
X = cyp3a4_swiss_feat.drop(cyp3a4_swiss_feat[['index', 'Inhibition Observed']], axis=1)
y = cyp3a4_swiss_feat['Inhibition Observed']
X_train, X_test, X_valid, y_train, y_test, y_valid = split_var(cyp3a4_df)
cyp3a4_sw_nn = MC_NN(X_train, X_test, X_valid, y_train, y_test, y_valid)

100%|██████████| 300/300 [00:15<00:00, 19.93it/s]

Dropout rate = 0, AUC: 0.485, ACC: 0.942





In [36]:
cyp3a4_sw_nn = pd.DataFrame(cyp3a4_sw_nn[2:5])
cyp3a4_sw_nn.reset_index(inplace = True)
cyp3a4_sw_nn.columns = ['index', 'Scores']
cyp2d6_sw_nn.to_csv('./data/model_output/cyp3a4_sw_nn.csv')

In [53]:
df_list = [cyp2c19_nn, cyp2c9_nn, cyp2d6_nn, cyp1a2_nn, cyp3a4_nn]
df1_list = [cyp2c19_sw_nn, cyp2c9_sw_nn, cyp2d6_sw_nn, cyp1a2_sw_nn, cyp3a4_sw_nn]

# Save concatonated results
cyp_nn_results = pd.concat(df_list)
cyp_sw_nn_results = pd.concat(df1_list)

# All Results together 
cyp_nn_results_ALL = [cyp_nn_results, cyp_sw_nn_results]
cyp_nn_results_ALL = pd.concat(cyp_nn_results_ALL)

In [55]:
cyp_nn_results.to_pickle('./data/model_output/cyp_nn_results.pkl')
cyp_sw_nn_results.to_pickle('./data/model_output/cyp_sw_nn_results.pkl')
cyp_nn_results_ALL.to_pickle('./data/model_output/cyp_nn_results_ALL.pkl')