In [1]:
import sys
import os

# Completely suppress stderr output
sys.stderr = open(os.devnull, 'w')

# Remaining setup: silence Python warnings and extend the module search path.
import warnings
warnings.filterwarnings('ignore')  # ignore every warning for the whole session

from pathlib import Path
sys.path.append('.')   # Add local directory to access some of the functions
sys.path.append('../../') # Make sure this is the location for the ART library

# warning_utils is not a standard-library module; presumably it is the local
# helper made importable by the '.' path entry above — TODO confirm.
import warning_utils
warning_utils.filter_end_user_warnings()

In [2]:
import pandas as pd
import numpy as np

from art.core import RecommendationEngine
import art.utility as utils

import shap
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'shap'

### Define directories

In [None]:
# Root of the drug-design data tree. Defaults to the original location, but can
# be redirected by setting the DTRA_ART_DATA_DIR environment variable so the
# notebook runs on machines with a different layout without editing this cell.
dataDir = os.environ.get('DTRA_ART_DATA_DIR', '/code/DTRA_ART/DrugDesignData/')

# Both sub-directories keep their trailing slash: later cells build file paths
# by plain string concatenation (modelBuildingDataDir + "<file>.csv").
modelBuildingDataDir = os.path.join(dataDir, 'modelBuildingData/')
resultsDir = os.path.join(dataDir, 'Results/')

### Extract the data for `Ebola` into a data frame with `duplicate` SMILES

In [None]:
# Read the Zaire ebolavirus CSV from the model-building directory, then show it.
zaire_csv_path = modelBuildingDataDir + "ZaireEbolavirusData_chEMBL_wMACAW.csv"
ZaireEbolavirusData_chEMBL_wMACAW = pd.read_csv(zaire_csv_path)
ZaireEbolavirusData_chEMBL_wMACAW

In [None]:
# List the column labels of the loaded table (the next cells pick features from them).
ZaireEbolavirusData_chEMBL_wMACAW.columns

### Prepare data to run `ART` on `Ebola` data with `duplicate` SMILES

Find Features and Response

In [None]:
# Feature columns are exactly those whose label starts with the 'MACAW_' prefix.
input_var = [
    column_name
    for column_name in ZaireEbolavirusData_chEMBL_wMACAW.columns
    if column_name.startswith('MACAW_')
]
print(input_var)

In [None]:
# Feature matrix: the selected MACAW_ columns as a NumPy array (one row per table row).
features = ZaireEbolavirusData_chEMBL_wMACAW[input_var].to_numpy()

In [None]:
# Name of the response column, kept as a one-element list because it is later
# used both to index the data frame and as ART's 'response_vars' setting.
response_var = ["pPotency"]
print(response_var)

In [None]:
# Response values as a NumPy array. response_var is a list, so indexing yields
# a one-column 2-D array rather than a flat vector.
response = ZaireEbolavirusData_chEMBL_wMACAW[response_var].to_numpy()

And then save the data as an EDD-style file (see "Importing a Study" here):

In [None]:
# Write features and response to the "ARTready" CSV via ART's EDD-style export helper.
art_ready_csv_path = modelBuildingDataDir + 'ZaireEbolavirusData_chEMBL_wMACAW_ARTready.csv'
utils.save_edd_csv(features, response, input_var, art_ready_csv_path, response_var)

### Predict response with ART

In [None]:
# Load the ARTready CSV back into a data frame and display it.
art_ready_csv = modelBuildingDataDir + "ZaireEbolavirusData_chEMBL_wMACAW_ARTready.csv"
ZaireEbolavirusData_chEMBL_wMACAW_ARTready = pd.read_csv(art_ready_csv)
ZaireEbolavirusData_chEMBL_wMACAW_ARTready

Let's now define the ART parameters needed to predict the response (`pPotency`):

In [None]:
# Settings unpacked into the RecommendationEngine call as keyword arguments.
art_params = dict(
    input_vars=input_var,
    response_vars=response_var,
    objective='maximize',
    threshold=0.2,
    alpha=0.5,
    num_recommendations=10,
    max_mcmc_cores=4,
    seed=42,  # fixed seed; comment this entry out to fall back to a random seed
    output_dir='.',
)

And then run ART without recommendations but with cross-validation, to gauge how generalizable the results are:

In [None]:
# Inspect the column labels of the ARTready table before handing it to ART.
ZaireEbolavirusData_chEMBL_wMACAW_ARTready.columns

In [None]:
%%time

art = RecommendationEngine(df=ZaireEbolavirusData_chEMBL_wMACAW_ARTready, **art_params,recommend=False,cross_val=True)

### Find SHAP values

We will now find which input features are most important by using SHAP analysis. First, let's initialize the library:

In [None]:
# Initialise SHAP's JavaScript support for rendering its plots in the notebook.
shap.initjs()

In [None]:
# Switch matplotlib to its 'default' style sheet for the figures below.
plt.style.use('default')

Define a wrapper function that provides the ART prediction given an input `X`, for use by the SHAP library:

In [None]:
def f(X):
    """Return the ART model's prediction for input ``X``.

    Thin adapter around the notebook-level ``art`` object so that SHAP can be
    given a plain callable.
    """
    predictions = art.predict(X)
    return predictions

Convert the ART input data into the pandas DataFrame that the SHAP library favors:

In [None]:
# Wrap ART's stored inputs in a data frame labelled with the feature names.
feature_names = art_params['input_vars']
X_df = pd.DataFrame(data=art.X, columns=feature_names)

Create and execute the explainer for the feature values:

In [None]:
# Build a SHAP explainer around the prediction wrapper f, using X_df as the
# data it is constructed with, then evaluate it on those same rows.
explainer_e   = shap.Explainer(f, X_df)
shap_values_e = explainer_e(X_df)

In [None]:
# Summary plot of the SHAP values computed above, across all features.
shap.summary_plot(shap_values_e)

In [None]:
# Heatmap of SHAP values: instances are ordered by their SHAP values summed
# over axis 1, and at most 15 entries are displayed.
per_instance_total = shap_values_e.sum(1)
shap.plots.heatmap(shap_values_e, instance_order=per_instance_total, max_display=15)


### Extract the data for `Ebola` into a data frame without `duplicate` SMILES

In [None]:
# Read the de-duplicated Ebola CSV from the model-building directory, then show it.
no_duplicates_csv_path = modelBuildingDataDir + "EbolaVirusData_chEMBL_wMACAW_noDuplicates_MLready.csv"
EbolaVirusData_chEMBL_noDuplicates = pd.read_csv(no_duplicates_csv_path)
EbolaVirusData_chEMBL_noDuplicates

### Prepare to run `ART` on `Ebola` data without `duplicate` SMILES