# <div style="text-align: center"> BiocompoundML </div>

In [None]:
#BCML library
from bcml import bcml_module as bcml

In [None]:
# Clean training/testing folders between runs to prevent KeyError
# -- Both flags are set, so cleaning is requested for the training and the testing data alike
bcml.clean_training_testing(training=True, testing=True)

### <mark>Required Input</mark> - Predictor Feature

In [None]:
# Predictor
# -- The desired feature to be predicted
# -- Passed to bcml.train_model / bcml.test_model below and used to label
# -- the "Low"/"High" classes and the captions of the rendered images
__PREDICTOR = 'RON'

#### Universal Options

In [None]:
# Universal options, shared by every BCML call made in this notebook
__OPTIONS = {
    # Network proxy (http://...)
    'proxy': '',

    # HTTP request try count
    # -- To prevent errors from failed connection attempts due to unstable network conditions
    'try_count': 5,

    # Random seed (for repeatable results)
    # -- Must be a positive integer
    'random_seed': None,

    # Verbose mode
    'verbose': True,

    # Chunks (for retrieving PubChem info)
    # -- Chunks allow PCP to split IDs into smaller sized chunks, which helps prevent problems
    # -- with querying too many IDs
    'chunks': 10,

    # Plot
    'plot': False,
}

In [None]:
# Model name (To save a new model or test an existing model)
# -- Also used further down to build the 'pre-built_models/<name>.model' and
# -- 'pre-built_models/<name>.featurecreature' file names
__OPTIONS['model_name'] = 'RON_model_new_LW'

In [None]:
# Feature sources: which kinds of features are collected for each compound
__OPTIONS.update({
    # Include user-provided features from PubChem
    'user': False,

    # Extract experimental features from PubChem
    'experimental': False,

    # Extract PaDEL-Descriptors from PubChem
    'chemofeatures': True,

    # Extract fingerprint features from PubChem
    # -- Will default to True if one of distance, cluster, or impute is marked True
    'fingerprint': True,

    # Use SMILES rather than CIDs
    # -- Requires SMILES in input files
    'smiles': False,
})

In [None]:
# Analysis and output options
__OPTIONS.update({
    # Run clustering
    'cluster': False,

    # Create a distance matrix
    # -- Will default to True if either cluster or impute are True
    'distance': True,

    # Impute missing values using K-NN imputation
    'impute': True,

    # Output data into numpy arrays
    'txt': False,
})

# Output folder
# -- Defaults to the directory the notebook is started from
import os
PATH = os.getcwd()
__OPTIONS.update(outputdir=PATH)

## Train a New/Existing Model
### <mark>Required Input</mark>

In [None]:
# Train a model?
# -- When False, the FeatureCreature section loads the saved model named by
# -- __OPTIONS['model_name'] from 'pre-built_models/' instead of training one
__TRAIN = True

In [None]:
# Training input file
# -- A relative filepath pointing to the desired input file.
# ---- 1) A tab-delimited .txt file with compound name, PubChem ID, and chemical features
# ---- 2) A previously generated pickle file (.cluster .model or .features)
# -- The empty default is a placeholder: fill it in before training (required input).

#####   WARNING !!!   #####
# -- The pickle format is inherently insecure: unpickling a file can execute arbitrary
# -- code, so a tampered file can run malicious commands. Only load pickle files whose
# -- source you trust.
__TRAINING_INPUT = ''

#### Training Options

In [None]:
# Options that only affect model training
__OPTIONS.update({
    # Boruta feature selection
    # -- Reduces uncharacterizing (uninformative) features
    'selection': True,

    # Split value (threshold for classification)
    # -- If None, median value is used
    'split_value': None,

    # Error correct for potentially erroneous values in the training set
    'error_correct': True,

    # Cross-validate the model -- currently unavailable
    'cross_validate': False,

    # Insert sample weights
    # -- Requires sample weights in the input file
    'weight': False,
})

#### Start Training

In [None]:
# Start from False placeholders; trained_model is replaced by the result of
# bcml.train_model only when training was requested through __TRAIN
trained_model = training = False

if __TRAIN:
    trained_model = bcml.train_model(
        __TRAINING_INPUT,
        __PREDICTOR,
        __OPTIONS,
    )

## Test a New/Existing Model

### <mark>Required Input</mark>

In [None]:
# Test the model?
# -- Must stay True for the FeatureCreature section: its guard cell stops the
# -- notebook when __TEST is False
__TEST = True

In [None]:
# Test input (__TEST_INPUT_FILE and/or __TEST_INPUT_DIRECTORY required)
# -- Both defaults below are empty placeholders; fill in at least one of them.

# Test Input File
# -- A tab-delimited .txt file with compound name and PubChem ID
# -- (PubChem IDs are required when a file is used)
__TEST_INPUT_FILE = None

# Test Input Directory
# -- A directory containing .sdf files (when PubChem IDs are unavailable)
__TEST_INPUT_DIRECTORY = ''

In [None]:
# Placeholders for the three values returned by bcml.test_model; they are
# only replaced when testing was requested through __TEST
from bcml import bcml_module as bcml
testing = test = prediction = True

if __TEST:
    testing, test, prediction = bcml.test_model(
        trained_model,
        __TEST_INPUT_FILE,
        __TEST_INPUT_DIRECTORY,
        __PREDICTOR,
        __OPTIONS,
    )

#### Predict Features Using the Model

#  
#  
# <div style="text-align: center"> FeatureCreature </div>

In [None]:
__RUN_FEATURE_CREATURE = True
__FC_MODEL_NAME = __OPTIONS.get('model_name')

# FeatureCreature requires compound characterization using BCML (__TEST), as well as
# training data from BCML __TRAIN or a previously saved FeatureCreature model __FC_MODEL_NAME
if not ((__TRAIN or __FC_MODEL_NAME) and (__TEST and __RUN_FEATURE_CREATURE)):
    # Stop running the jupyter notebook here.
    # The exception type is still AssertionError, but it is raised explicitly:
    # an `assert` statement is removed when Python runs with -O, and it gave
    # the user no hint about why the notebook stopped.
    raise AssertionError(
        'FeatureCreature cannot run: it needs __TEST and __RUN_FEATURE_CREATURE '
        'to be True, plus either __TRAIN or a model name in '
        "__OPTIONS['model_name']."
    )

In [1]:
# Visualization libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

# Explanability libraries
import lime
import lime.lime_tabular

# Chemistry libraries
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from IPython.display import display, Image

import imp
import os
import zipfile
from sys import platform
# Pick the bundled Indigo distribution for the running operating system.
# The name is both the folder the archive unpacks to and the stem of the
# '<name>.zip' archive expected next to this notebook.
if platform == 'darwin':
    indigo_dir = 'indigo-python-1.2.3.r0-mac'
elif platform.startswith('linux'):  # 'linux' (Python 3) or 'linux2' (Python 2)
    indigo_dir = 'indigopython130_linux'
elif platform in ('win32', 'win64'):
    indigo_dir = 'indigopython130_win'
else:
    # Fail with a clear message instead of a NameError on `indigo` further down
    raise RuntimeError('No bundled Indigo distribution for platform %r' % platform)

# Unpack the archive on first use only; the context manager closes the zip handle
if not os.path.isdir('./' + indigo_dir):
    with zipfile.ZipFile('./' + indigo_dir + '.zip', 'r') as zipref:
        zipref.extractall('.')

# NOTE: `imp` is deprecated and was removed in Python 3.12; this loader needs
# to move to importlib before the notebook can run on newer interpreters.
# load_source takes (module name, path) - the Windows branch used to omit the
# module name and failed with a TypeError.
indigo = imp.load_source('indigo', indigo_dir + '/indigo.py')
indigo_renderer = imp.load_source('indigo_renderer', indigo_dir + '/indigo_renderer.py')

# From here on `indigo` is the Indigo session object (not the module); the
# functions and cells below call its methods directly.
indigo = indigo.Indigo()
indigoRenderer = indigo_renderer.IndigoRenderer(indigo)

# Other libraries
import glob
import dill
import numpy as np
import sklearn
from collections import defaultdict

## Train FeatureCreature

#### Option - Save FeatureCreature Model

In [None]:
# Save the freshly built explainer to 'pre-built_models/<model name>.featurecreature'
# -- Only has an effect when the explainer is built in this run
__SAVE_FEATURECREATURE = True
# Build a new explainer from the training data
# -- When False, a previously saved .featurecreature file is loaded instead
__BUILD_FEATURECREATURE_EXPLAINER = True

In [None]:
# Map feature IDs to .sdf patterns
# -- feature_list.txt is tab-delimited with two columns per row: feature ID, pattern
# -- ('%' starts a comment in that file)
feature_list = np.genfromtxt("feature_list.txt", dtype="str", delimiter="\t", comments="%")

feature_ids = []
feature_patterns = []
for feature_id, feature_pattern in feature_list:
    feature_ids.append(feature_id)
    feature_patterns.append(feature_pattern)

# Reverse lookup: pattern -> feature ID
feature_dict = dict(zip(feature_patterns, feature_ids))

In [None]:
# No model was trained in this run: fall back to the saved model named by
# __FC_MODEL_NAME in the 'pre-built_models/' folder.
# NOTE(review): presumably a pickle file (see the warning next to
# __TRAINING_INPUT) - only load models from a trusted source.
if not __TRAIN:
    trained_model = bcml.existing_training_model('pre-built_models/'+__FC_MODEL_NAME+ ".model", __OPTIONS)

In [None]:
# Stays False when the explainer is not built here; the next section then
# loads a saved one from disk
explainer = False
if __BUILD_FEATURECREATURE_EXPLAINER:
    # Load training data and process into compound-feature matrix
    training_cpds = trained_model.input.compound
    cpd_names = list(training_cpds.keys())

    # Feature names are taken from the first compound; the 'Name' entry is
    # dropped because it is not converted to a number below
    fc_features = list(training_cpds[cpd_names[0]]['padelhash'].keys())
    if 'Name' in fc_features:
        fc_features.remove('Name')

    # One row per compound, one column per feature
    fc_training = np.zeros((len(training_cpds), len(fc_features)), dtype=np.float64)
    for row, cpd_name in enumerate(cpd_names):
        padel = training_cpds[cpd_name]['padelhash']
        for col, feat_name in enumerate(fc_features):
            fc_training[row, col] = float(padel[feat_name])

    # Remove invariable features (columns whose variance is zero)
    # NOTE(review): the columns follow fc_features, while IDs/patterns are taken
    # by position from feature_list.txt - this assumes both share the same
    # order; confirm.
    variable_cols = np.where(fc_training.var(axis=0) != 0)[0]
    reduced_X = fc_training[:, variable_cols]
    reduced_feature_ids = [feature_ids[col] for col in variable_cols]
    reduced_feature_patterns = [feature_patterns[col] for col in variable_cols]

    # Every remaining column is declared categorical; its category names are
    # the distinct values found in that column
    categorical_features = range(len(reduced_feature_ids))
    categorical_names = {
        col: sklearn.preprocessing.LabelEncoder().fit(reduced_X[:, col]).classes_
        for col in categorical_features
    }

    explainer = lime.lime_tabular.LimeTabularExplainer(
        reduced_X,
        verbose=True,
        feature_names=reduced_feature_patterns,
        class_names=['Low %s' % __PREDICTOR, 'High %s' % __PREDICTOR],
        categorical_features=categorical_features,
        categorical_names=categorical_names,
        kernel_width=3,
    )

    # The classifier and the feature IDs ride along on the explainer object so
    # that a single file holds everything needed for prediction
    explainer.clf = sklearn.ensemble.RandomForestClassifier(
        n_estimators=512,
        oob_score=True,
        n_jobs=-1,
        class_weight="balanced",
    )
    explainer.clf.fit(X=reduced_X, y=trained_model.predictors)
    explainer.feature_ids = reduced_feature_ids

    if __SAVE_FEATURECREATURE:
        filename = 'pre-built_models/' + __FC_MODEL_NAME + '.featurecreature'
        with open(filename, 'wb') as fid:
            dill.dump(explainer, fid)

## Predict Using FeatureCreature

In [None]:
# No explainer was built in this run (it is still False): load the one saved
# earlier under the current model name.
# WARNING: dill, like pickle, can execute arbitrary code while loading a file.
# Only load .featurecreature files from a source you trust.
if not explainer:
    filename = 'pre-built_models/'+__FC_MODEL_NAME + '.featurecreature'
    with open(filename, 'rb') as fid:
        explainer = dill.load(fid)

#### FeatureCreature Image Coloring Functions

In [None]:
# For each pattern, iterate atoms and map explainability score -
# results are cumulative, so negative and positive values cancel out
def getAtomsActivity (m, patterns):
    """Accumulate a per-atom explainability score for molecule ``m``.

    m        -- molecule object handed to ``indigo.substructureMatcher``
    patterns -- iterable of ``(pattern, value)`` pairs; each pattern is loaded
                with ``indigo.loadQueryMolecule`` and matched against ``m``

    Returns a ``defaultdict(float)`` mapping atom index -> summed score. For
    every match of a pattern, each matched atom receives
    ``value / query.countAtoms()``.

    Best effort: a pattern that cannot be loaded or matched is skipped (scores
    added before the failure are kept).
    """
    matcher = indigo.substructureMatcher(m)
    atom_values = defaultdict(float)
    for pattern, value in patterns:
        try:
            query = indigo.loadQueryMolecule(pattern)
            # Same share for every atom of every match - computed once
            share = value / query.countAtoms()
            for match in matcher.iterateMatches(query):
                for qatom in query.iterateAtoms():
                    atom = match.mapAtom(qatom)
                    atom_values[atom.index()] += share
        except Exception:
            # Skip the unusable pattern. `Exception` (not a bare except) lets
            # KeyboardInterrupt / SystemExit still stop a long run.
            continue
    return atom_values

# Convert atom values to color scores: blue direction = negative, red direction = positive
def addColorSGroups (m, atom_values):
    """Attach a "color" data S-group to every atom with a non-zero score.

    m           -- molecule object providing ``addDataSGroup``
    atom_values -- mapping atom index -> score

    Negative scores are written as "0, 0, <x>" (blue channel), positive scores
    as "<x>, 0, 0" (red channel), where <x> is the score divided by the
    midpoint of the score range. The intensity is not clamped.

    Returns ``(min_value, max_value)`` of the scores, or ``(0.0, 0.0)`` when
    ``atom_values`` is empty.
    """
    if not atom_values:
        # Nothing matched: leave the molecule uncolored instead of failing in min()
        return 0., 0.

    min_value = min(atom_values.values())
    max_value = max(atom_values.values())
    centered_value = (min_value + max_value) / 2.
    # A symmetric range (min == -max) has a midpoint of 0, which cannot be
    # divided by; fall back to the largest magnitude in that case.
    scale = centered_value or max(abs(min_value), abs(max_value))
    for atom_index, atom_value in atom_values.items():
        if atom_value < 0.:
            color = "0, 0, %f" % abs(atom_value / scale)
        elif atom_value > 0.:
            color = "%f, 0, 0" % abs(atom_value / scale)
        else:
            # Scores that cancelled out exactly carry no color. Previously this
            # case reused the previous atom's color or hit an unbound variable.
            continue
        m.addDataSGroup([atom_index], [], "color", color)
    return min_value, max_value

# Take mol file and pattern list and associate these patterns with the atoms
# and bonds and color them, based on the explainability results of LIME
def assignColorGroups (m, patterns):
    """Score the atoms of ``m`` from ``patterns`` and color them accordingly.

    Chains getAtomsActivity (per-atom scores) into addColorSGroups (color
    S-groups) and hands back the latter's ``(min_value, max_value)`` pair.
    """
    return addColorSGroups(m, getAtomsActivity(m, patterns))

#### Process Test Data

In [None]:
# Link sdf files of test compounds
compounds = []
if __TEST and __TEST_INPUT_FILE:
    compounds += glob.glob('bcml/Chemoinformatics/db/testing/*.sdf')
if __TEST and __TEST_INPUT_DIRECTORY:
    compounds += glob.glob(__TEST_INPUT_DIRECTORY + '/*.sdf')
# NOTE(review): the rendering cell pairs compounds[row] with fc_test[row].
# glob does not guarantee any particular order, so this assumes the files come
# back in the same order as test.compounds - confirm.

# Map PubChem IDs to compound names
# -- Always defined (empty without a test input file), so a later lookup can
# -- only miss a key and never hit an undefined name
# -- Integer indexing is kept because only len() and [i] are known to work on
# -- testing.compounds
pubchem_id_dict = {}
if __TEST_INPUT_FILE:
    pubchem_id_dict = {
        testing.compounds[i]['PubChem']: testing.compounds[i]['Name']
        for i in range(len(testing.compounds))
    }


# Prediction function handed to LIME: class probabilities from the
# FeatureCreature classifier, as floats
def predict_fn(x):
    return explainer.clf.predict_proba(x).astype(float)


# Split value for displaying in PNG
# -- The user-supplied threshold, or the median of the training values
__SPLIT_VALUE = __OPTIONS.get('split_value')
if __SPLIT_VALUE is None:
    __SPLIT_VALUE = np.median(trained_model.predictor_values)


# Load testing data
test_cpds = test.compounds
fc_feature_ids = explainer.feature_ids
fc_feature_patterns = explainer.feature_names

# One row per test compound, one column per feature kept by the explainer
fc_test = np.zeros((len(test_cpds), len(fc_feature_ids)), dtype=np.float64)
for row in range(len(test_cpds)):
    padel = test_cpds[row]['padelhash']
    for col, feature_id in enumerate(fc_feature_ids):
        fc_test[row, col] = float(padel[feature_id])

In [None]:
# Generate .png files of test compounds
import datetime
import os

currentDT = datetime.datetime.now()

# Render options that are the same for every compound are set once
indigo.setOption("render-atom-ids-visible", "false")
indigo.setOption("render-atom-color-property", "color")
indigo.setOption('render-coloring', False)
indigo.setOption('render-comment-font-size', 32)
indigo.setOption('render-bond-line-width', 2.0)
indigo.setOption("render-margins", 100, 1)

for row, filename in enumerate(compounds):
    # File name without folder and extension. os.path understands the path
    # separator of the running platform; splitting on '/' kept the whole
    # path on Windows, where glob returns backslashes.
    id_name = os.path.splitext(os.path.basename(filename))[0]

    # Collect explanations from LIME
    exp = explainer.explain_instance(fc_test[row], predict_fn, num_features=len(fc_feature_patterns),
                                     top_labels=1, verbose=True, num_samples=5000)

    # Load molecule (first record of the .sdf file)
    mol = indigo.iterateSDFile(filename)
    m = mol.at(0)

    # Keep the (pattern, weight) pairs of the local explanation whose feature
    # is present in the molecule (value 1) and whose weight is not zero
    patterns = []
    for weights in exp.local_exp.values():
        for num, val in weights:
            if float(exp.domain_mapper.feature_values[num]) == 1. and abs(val) != 0.:
                patterns.append((fc_feature_patterns[num], val))

    # Caption: the compound name when the PubChem ID is known, else the file
    # name. NameError covers a run where pubchem_id_dict was never created.
    try:
        display_name = pubchem_id_dict[id_name]
    except (NameError, KeyError):
        display_name = id_name
    indigo.setOption('render-comment', '%s (%.2f%% probability - %s > %.2f)' % (display_name,
                                                               100*prediction[id_name],
                                                               __PREDICTOR, __SPLIT_VALUE))

    # Coloring is best effort: when it fails the molecule is still rendered,
    # just without colors. Only ordinary errors are tolerated, so the run can
    # still be interrupted.
    try:
        assignColorGroups(m, patterns)
    except Exception as err:
        if __OPTIONS.get('verbose'):
            print('Could not color %s: %s' % (id_name, err))

    result_folder = bcml.check_results_folder(__OPTIONS.get('outputdir'), currentDT, __PREDICTOR)
    # NOTE(review): plain concatenation assumes result_folder already ends
    # with a path separator - confirm against bcml.check_results_folder.
    renderfile = result_folder + str(id_name) + ".png"
    indigoRenderer.renderToFile(m, renderfile)

In [None]:
# Final cell: completion message
print('All done! :)')