In [1]:
import sys
import os

# Silence everything written to stderr from here on by swapping the stream
# for a handle to the null device.
# NOTE(review): the handle is never closed and the original stream is not
# saved, so stderr cannot be restored later and any diagnostic written to
# it is discarded.
sys.stderr = open(os.devnull, 'w')

# The remaining imports run after the redirect, so whatever they write to
# stderr is hidden as well
import warnings
warnings.filterwarnings('ignore')  # Ignore all Python warnings

from pathlib import Path
sys.path.append('.')   # Add local directory to access some of the functions
sys.path.append('../../') # Make sure this is the location for the ART library

# Project-local helper module; presumably installs extra warning filters
# aimed at end users -- confirm against warning_utils
import warning_utils
warning_utils.filter_end_user_warnings()

In [2]:
import pandas as pd
import numpy as np

from art.core import RecommendationEngine
import art.utility as utils

#import shap
import matplotlib.pyplot as plt

### Define directories

In [3]:
# Root of the drug-design data tree; model inputs and results live in
# sub-folders directly beneath it
dataDir = '/code/DTRA_ART/DrugDesignData/'
modelBuildingDataDir, resultsDir = (
    os.path.join(dataDir, subfolder)
    for subfolder in ('modelBuildingData/', 'Results/')
)

### Extract the data for `Ebola` into a data frame with `duplicate` SMILES

In [4]:
# Read the Zaire ebolavirus ChEMBL table with its MACAW columns from the
# model-building data folder (repeated SMILES are kept in this file)
ZaireEbolavirusData_chEMBL_wMACAW = pd.read_csv(
    os.path.join(modelBuildingDataDir, "ZaireEbolavirusData_chEMBL_wMACAW.csv")
)
ZaireEbolavirusData_chEMBL_wMACAW

Unnamed: 0,ID,compound_id,Smiles,pPotency,StrainClassifier,pPotency_category,MACAW_1,MACAW_2,MACAW_3,MACAW_4,...,MACAW_6,MACAW_7,MACAW_8,MACAW_9,MACAW_10,MACAW_11,MACAW_12,MACAW_13,MACAW_14,MACAW_15
0,1,CHEMBL221722,NC(=O)c1nc(F)cnc1O,4.173925,ZaireEbola,3-10,-0.597267,0.203766,0.080453,0.214558,...,0.115450,-0.216024,0.024546,-0.000042,-0.013658,-0.051793,0.042074,-0.001572,0.024070,-0.018438
1,2,CHEMBL221722,NC(=O)c1nc(F)cnc1O,4.173925,ZaireEbola,3-10,-0.597267,0.203766,0.080453,0.214558,...,0.115450,-0.216024,0.024546,-0.000042,-0.013658,-0.051793,0.042074,-0.001572,0.024070,-0.018438
2,3,CHEMBL221722,NC(=O)c1nc(F)cnc1O,4.619789,ZaireEbola,3-10,-0.597267,0.203766,0.080453,0.214558,...,0.115450,-0.216024,0.024546,-0.000042,-0.013658,-0.051793,0.042074,-0.001572,0.024070,-0.018438
3,4,CHEMBL221722,NC(=O)c1nc(F)cnc1O,4.619789,ZaireEbola,3-10,-0.597267,0.203766,0.080453,0.214558,...,0.115450,-0.216024,0.024546,-0.000042,-0.013658,-0.051793,0.042074,-0.001572,0.024070,-0.018438
4,5,CHEMBL5611967,O=C(Nc1cccc(Cl)c1)c1ccc(-c2cccc(OC3CCNCC3)c2)s1,5.730487,ZaireEbola,3-10,0.194000,0.271922,0.028860,0.008549,...,0.004466,-0.010283,0.023163,0.044987,0.174677,0.079320,0.070356,-0.031430,-0.032638,0.182068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,131,CHEMBL5398314,O=C(CCNCCC1CCN(Cc2ccccc2)CC1)Nn1c2ccccc2c2ccccc21,6.283997,ZaireEbola,3-10,0.002611,-0.102177,-0.327227,0.065414,...,0.093176,0.087306,-0.061251,-0.043842,0.074410,0.026748,0.084964,-0.171084,-0.026267,-0.080352
131,132,CHEMBL5398314,O=C(CCNCCC1CCN(Cc2ccccc2)CC1)Nn1c2ccccc2c2ccccc21,6.283997,ZaireEbola,3-10,0.002611,-0.102177,-0.327227,0.065414,...,0.093176,0.087306,-0.061251,-0.043842,0.074410,0.026748,0.084964,-0.171084,-0.026267,-0.080352
132,133,CHEMBL5424177,O=C(CNCCC1CCN(Cc2ccccc2)CC1)NN(c1ccccc1)c1ccccc1,5.721246,ZaireEbola,3-10,-0.027520,-0.131712,-0.412087,-0.021914,...,0.138736,0.089382,-0.111563,0.007607,0.086831,-0.007486,0.114860,-0.183397,-0.088811,-0.045560
133,134,CHEMBL5424177,O=C(CNCCC1CCN(Cc2ccccc2)CC1)NN(c1ccccc1)c1ccccc1,5.721246,ZaireEbola,3-10,-0.027520,-0.131712,-0.412087,-0.021914,...,0.138736,0.089382,-0.111563,0.007607,0.086831,-0.007486,0.114860,-0.183397,-0.088811,-0.045560


In [5]:
# Display the column labels of the loaded table
ZaireEbolavirusData_chEMBL_wMACAW.columns

Index(['ID', 'compound_id', 'Smiles', 'pPotency', 'StrainClassifier',
       'pPotency_category', 'MACAW_1', 'MACAW_2', 'MACAW_3', 'MACAW_4',
       'MACAW_5', 'MACAW_6', 'MACAW_7', 'MACAW_8', 'MACAW_9', 'MACAW_10',
       'MACAW_11', 'MACAW_12', 'MACAW_13', 'MACAW_14', 'MACAW_15'],
      dtype='object')

### Prepare data to run `ART` on `Ebola` data with `duplicate` SMILES

Find Features and Response

In [6]:
# Feature columns are those whose name starts with the 'MACAW_' prefix,
# kept in the order they appear in the table
input_var = list(
    filter(
        lambda name: name.startswith('MACAW_'),
        ZaireEbolavirusData_chEMBL_wMACAW.columns,
    )
)
print(input_var)

['MACAW_1', 'MACAW_2', 'MACAW_3', 'MACAW_4', 'MACAW_5', 'MACAW_6', 'MACAW_7', 'MACAW_8', 'MACAW_9', 'MACAW_10', 'MACAW_11', 'MACAW_12', 'MACAW_13', 'MACAW_14', 'MACAW_15']


In [7]:
# Feature matrix: the selected MACAW columns of every row, as a NumPy array
features = ZaireEbolavirusData_chEMBL_wMACAW.loc[:, input_var].to_numpy()

In [8]:
# Name of the single response column, kept as a one-element list
response_var = ["pPotency"]
print(response_var)

['pPotency']


In [9]:
# Response values as a NumPy array; selecting with a list of column names
# keeps the result two-dimensional
response = ZaireEbolavirusData_chEMBL_wMACAW.loc[:, response_var].to_numpy()

And then save the data as an EDD-style file (see "Importing a Study" here):

In [10]:
# Write the features and response to an EDD-style CSV in the model-building
# data folder, passing the feature names and the response name along
utils.save_edd_csv(features, response, input_var, modelBuildingDataDir + 'ZaireEbolavirusData_chEMBL_wMACAW_ARTready.csv', response_var)

### Predict response with ART

In [None]:
# Read back the EDD-style file written to the model-building data folder
ZaireEbolavirusData_chEMBL_wMACAW_ARTready = pd.read_csv(
    os.path.join(modelBuildingDataDir, "ZaireEbolavirusData_chEMBL_wMACAW_ARTready.csv")
)
ZaireEbolavirusData_chEMBL_wMACAW_ARTready

Let's now define the ART parameters needed to predict the response (`pPotency`):

In [None]:
# Keyword arguments that are later unpacked into the RecommendationEngine call
art_params = {
    'input_vars': input_var,
    'response_vars': response_var,
    'objective': 'maximize',
    'threshold': 0.2,
    'alpha': 0.5,
    'num_recommendations': 10,
    'max_mcmc_cores': 4,
    'seed': 42,                    # Fixed seed; comment this line out to use a random seed
    'output_dir': '.',
}

And then run ART without recommendations but with cross-validation to gauge how generalizable the results are:

In [None]:
# Display the column labels of the ART-ready table before handing it to ART
ZaireEbolavirusData_chEMBL_wMACAW_ARTready.columns

In [None]:
%%time

# Build the ART engine from the ART-ready table and the parameters above,
# with recommendations switched off and cross-validation switched on.
# (The %%time magic has to stay on the first line of the cell.)
art = RecommendationEngine(df=ZaireEbolavirusData_chEMBL_wMACAW_ARTready, **art_params,recommend=False,cross_val=True)

### Find SHAP values

We will now find which input features are most important by using SHAP analysis. First, let's initialize the library:

In [None]:
# The notebook-level `import shap` is commented out, so SHAP is brought into
# scope here; without this every `shap.*` call from this cell onward raises
# NameError on a fresh kernel. Importing locally keeps the earlier cells
# runnable even where SHAP is not installed.
import shap

# Load SHAP's JavaScript support so its interactive plots render inline
shap.initjs()

In [None]:
# Switch matplotlib to its 'default' style sheet for the plots that follow
plt.style.use('default')

Define a wrapper function that provides the ART prediction given an input X, for use by the SHAP library:

In [None]:
def f(X):
    """Return the ART prediction for the inputs ``X``.

    Thin wrapper around ``art.predict`` that is handed to the SHAP explainer
    as the model function.
    """
    predictions = art.predict(X)
    return predictions

Convert the ART input data into the pandas DataFrame that the SHAP library favors:

In [None]:
# Wrap ART's input matrix in a DataFrame whose columns carry the feature names
X_df = pd.DataFrame(art.X, columns=art_params['input_vars'])

Create and execute the explainer for the feature values:

In [None]:
# Build a SHAP explainer around the prediction wrapper `f`, with X_df passed
# as its second argument, then compute SHAP values for those same rows
explainer_e   = shap.Explainer(f, X_df)
shap_values_e = explainer_e(X_df)

In [None]:
# Summary plot of the SHAP values computed above
shap.summary_plot(shap_values_e)

In [None]:
# Heatmap of the SHAP values: instances are ordered by their summed SHAP
# value and at most 15 features are displayed
shap.plots.heatmap(shap_values_e, instance_order=shap_values_e.sum(1), max_display=15)


### Extract the data for `Ebola` into a data frame without `duplicate` SMILES

In [None]:
# Read the Ebola table from the model-building data folder, this time the
# file without repeated SMILES
EbolaVirusData_chEMBL_noDuplicates = pd.read_csv(
    os.path.join(modelBuildingDataDir, "EbolaVirusData_chEMBL_wMACAW_noDuplicates_MLready.csv")
)
EbolaVirusData_chEMBL_noDuplicates

### Prepare to run `ART` on `Ebola` data without `duplicate` SMILES