# Use logBB, PARP, BCRP and MDR1 models to predict on the TargetMol library

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to imported source files take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Plotting and numerics.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os.path
# Global seaborn look: large "poster" text, white grid background, Set2 colours.
sns.set_context("poster")
sns.set_style("whitegrid")
sns.set_palette("Set2")
# Keep the active palette in a variable so colours can be reused by name.
pal = sns.color_palette()

import pandas as pd
import sys
import umap

# Cheminformatics and atomsci.ddm modules. Only `pfm` (predict_from_model) is
# used in the cells visible in this notebook section.
# NOTE(review): the other imports may be used in cells not shown here - confirm before pruning.
from rdkit import Chem
from atomsci.ddm.pipeline import model_pipeline as mp
from atomsci.ddm.pipeline import parameter_parser as parse
import atomsci.ddm.utils.struct_utils as struct_utils
import atomsci.ddm.pipeline.model_tracker as mt
import atomsci.ddm.pipeline.chem_diversity as cd
import atomsci.ddm.pipeline.predict_from_model as pfm
import atomsci.ddm.pipeline.featurization as feat

# Warning filter left disabled by the author.
#import warnings
#warnings.filterwarnings(action='once')

from sklearn.metrics import roc_curve, auc, roc_auc_score, r2_score, precision_recall_curve, average_precision_score, confusion_matrix

# Limit how much of a DataFrame is rendered: at most 20 rows and 90 columns.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 90)

# Turn off jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False

2021-12-23 08:44:18,454 Model tracker client not supported in your environment; will save models in filesystem only.


## Load data

In [3]:
# Read the TargetMol compound table and preview its first two rows.
df = pd.read_csv(
    filepath_or_buffer="/usr/workspace/atom/PARP_compounds/Model_Predictions/TargetMol/targetmol.csv",
)
df.head(n=2)

Unnamed: 0,Index,Plate,Row,Col,Volume（microliter）,Concentration（mM）,CAS,compound_id,Name,Synonyms,SMILES,Formula,MolWt,Pathway,Target,Receptor,Bioactivity,Reference,base_rdkit_smiles,smilength
0,1,M29851,A,3,200,10,13103-34-9,T3137,Parenabol,Ba 29038;Boldenone Undecylenate;Vebonol,C=CCCCCCCCCC(=O)O[C@H]1CC[C@@H]2[C@]1(C)CC[C@H...,C30H44O3,452.67,Endocrinology/Hormones,Androgen Receptor agonist,Androgen Receptor,Boldenone undecylenate is a synthetic steroid.,"Oda SS, El-Ashmawy IM. Int J Exp Pathol. 2012 ...",C=CCCCCCCCCC(=O)O[C@H]1CC[C@H]2[C@@H]3CCC4=CC(...,75
1,2,M29851,A,4,200,10,76-49-3,T1246,Bornyl acetate,Isobornyl acetate;Bornyl acetic ether;2-Campha...,CC(=O)O[C@@H]1C[C@@H]2CC[C@@]1(C)C2(C)C.CC(=O)...,C12H20O2,196.29,Immunology/Inflammation,IL Receptor inhibitor,IL receptor,Bornyl acetate has an anti-inflammatory effect...,"Yang H, et al. IUBMB Life. 2014 Dec;66(12):854-9.",CC(=O)O[C@@H]1C[C@@H]2CC[C@@]1(C)C2(C)C,39


## Predict

In [4]:
# All training sets and saved model tarballs live under one shared directory tree;
# the per-target sub-directories are spelled out once and reused below.
_models_root = '/usr/workspace/atom/PARP_compounds/Datasets_and_Models'
_bbb_dir = f'{_models_root}/BBB'
_parp1_dir = f'{_models_root}/PARP1'
_parp2_dir = f'{_models_root}/PARP2'
_pgp_dir = f'{_models_root}/PGP'
_bcrp_dir = f'{_models_root}/BCRP/intern_models_bad_data'

# Training-data CSVs, one per model. Each is later handed to the prediction call
# as `external_training_data`.
bb_training_data_path = f'{_bbb_dir}/Final_Regresion_logBB.csv'    # logBB
ki_training_data_path = f'{_parp1_dir}/the_final_ki_data.csv'      # PARP1 Ki
xc_training_data_path = f'{_parp1_dir}/the_final_xc50_data.csv'    # PARP1 XC50
pi_training_data_path = f'{_pgp_dir}/reg_in_cur_uni.csv'           # PGP inhibition
bi_training_data_path = f'{_bcrp_dir}/BCRP_cla_in_final.csv'       # BCRP inhibition
pt_training_data_path = f'{_pgp_dir}/reg_tr_cur_uni2.csv'          # PGP transport
bt_training_data_path = f'{_bcrp_dir}/BCRP_cla_tr_final.csv'       # BCRP transport
p2_training_data_path = f'{_parp2_dir}/PARP2_cur.csv'              # PARP2 inhibition

# Saved model tarballs, using the same two-letter prefixes as the CSVs above.
bb_path = f"{_bbb_dir}/Final_Regresion_logBB_model_69f30592-6ae7-44b3-a2a6-e84cf53a4d15.tar.gz"
ki_path = f"{_parp1_dir}/the_final_ki_data_model_4545b415-b838-4c17-b183-3ba7d1b57648.tar.gz"
xc_path = f"{_parp1_dir}/the_final_xc50_data_model_396cdb1d-1c54-4014-b84f-07ca5ed7f506.tar.gz"
pi_path = f"{_pgp_dir}/reg_in_cur_uni_model_1b15c0ba-2f9c-4697-a6c2-40391c096000.tar.gz"
bi_path = f"{_bcrp_dir}/BCRP_cla_in_final_model_d5a7d5ba-555e-42bd-ac19-87456397a791.tar.gz"
pt_path = f"{_pgp_dir}/reg_tr_cur_uni2_model_4bcd3b45-e006-48a5-9cd2-da7098551e65.tar.gz"
bt_path = f"{_bcrp_dir}/BCRP_cla_tr_final_model_d62b2e1a-bdc6-4de2-8e16-c7df2fa6e813.tar.gz"
p2_path = f"{_parp2_dir}/cb99da89-a9c8-4db4-8f4a-e6688862a34e.tar.gz"

In [5]:
# Models to run, one tuple per model:
#   (model tarball path,
#    label printed in the log and used in the results filename,
#    descriptor type,
#    training-data CSV path)
# Entries the author annotated as working are kept below, commented out; only
# the PGP inhibition model is active.
models = [
    # (bb_path, 'logBB',       'rdkit_raw',        bb_training_data_path), # works # NN regression
    # (ki_path, 'PARP1_Ki',    'ecfp',             ki_training_data_path), # works # RF regression
    # (xc_path, 'PARP1_XC50',   'ecfp',            xc_training_data_path), # works # XG regression
    # (p2_path, 'PARP2_inhib', 'rdkit_raw',        p2_training_data_path), # works # NN classification
    # (bi_path, 'BCRP_inhib',  'rdkit_raw',        bi_training_data_path), # works # NN classification
    # (pt_path, 'PGP_txpt',    'mordred_filtered', pt_training_data_path), # works # RF regression
    # (bt_path, 'BCRP_txpt',   'mordred_filtered', bt_training_data_path), # works # NN classification
    (
        pi_path,
        'PGP_inhib',
        'rdkit_raw',
        pi_training_data_path,
    ),  # doesn't work # RF regression
]

In [9]:
# Run every model listed in `models` on the TargetMol compounds and write one
# prediction CSV per model to the current directory.
#
# Re-reading the library here keeps the cell independent of `df` from earlier cells.
df=pd.read_csv("/usr/workspace/atom/PARP_compounds/Model_Predictions/TargetMol/targetmol.csv")

# df=df[df.MolWt<2000]

smiles=df.base_rdkit_smiles.tolist()

# Settings shared by every model; hoisted out of the loop because they never change.
smiles_col = 'base_rdkit_smiles'
response_col = None
dont_standardize = True
AD_method = 'z_score'

# Largest magnitude a float32 can hold. The previous run of this cell ended with
# "ValueError: Input contains NaN, infinity or a value too large for dtype('float32')"
# on the rdkit_raw RF model. The logged feature transformer passes values through
# np.nan_to_num, which maps +/-inf to the largest float64, so any infinite or
# out-of-range descriptor reaches the model as a value float32 cannot represent.
float32_max = np.finfo(np.float32).max

for modpath, modtype, feattype, extdata in models:
    print('\n', modtype, '\n')
    model_path = modpath
    if feattype != 'ecfp':
        # Descriptor-based models use the precomputed descriptor table for this
        # featurizer, restricted to compounds present in the library file.
        feat_data_path = f"/usr/workspace//atom/PARP_compounds/Model_Predictions/TargetMol/scaled_descriptors/targetmol_with_{feattype}_descriptors.csv"
        feat_data = pd.read_csv(feat_data_path)
        feat_data = feat_data[feat_data.base_rdkit_smiles.isin(smiles)]

        # Drop compounds with an infinite or float32-overflowing numeric value.
        # NaN compares False here, so NaN entries are kept on purpose: the logged
        # transformer line shows nan_to_num being applied to the scaled features.
        # NOTE(review): this inspects every numeric column, assuming they are all
        # descriptors - confirm against the descriptor file's schema.
        numeric_cols = feat_data.select_dtypes(include='number')
        overflow_rows = (numeric_cols.abs() > float32_max).any(axis=1)
        if overflow_rows.any():
            print(f"Dropping {int(overflow_rows.sum())} of {len(feat_data)} compounds "
                  f"with infinite or float32-overflowing {feattype} descriptor values")
            feat_data = feat_data[~overflow_rows]
        is_featurized=True
    else:
        # ECFP models are fed the raw library table (is_featurized=False).
        feat_data=df
        is_featurized=False
    input_df = feat_data

    # The input's 'compound_id' column is renamed to the ID column name passed
    # to the prediction call for this model type.
    # NOTE(review): presumably these are the ID column names used in each
    # model's training data - confirm.
    if modtype=='logBB':
        id_col='Name'
        input_df=input_df.rename(columns={'compound_id':'Name'})
    elif modtype in ['PARP1_Ki', 'PARP1_XC50']:
        id_col='new_compound_id'
        input_df=input_df.rename(columns={'compound_id':'new_compound_id'})
    else:
        id_col = 'compound_id'

    pred_df = pfm.predict_from_model_file(model_path = model_path, input_df=input_df,
                                          id_col=id_col, smiles_col=smiles_col, response_col=response_col,
                                          dont_standardize=dont_standardize, is_featurized = is_featurized,
                                          AD_method=AD_method, external_training_data=extdata,
                                          # turn verbose on
                                          verbose=True
                                          )

    pred_df.to_csv(f"./targetmol_pred_{modtype}.csv", index=False)


 PGP_inhib 



2021-12-23 08:58:11,567 ['ampl_version', 'time_generated', 'time_built', 'dataset_metadata', 'training_metrics'] are not part of the accepted list of parameters and will be ignored
2021-12-23 08:58:11,573 Reloading transformers from model tarball /tmp/tmp0xsp24te/transformers.pkl
2021-12-23 08:58:11,738 Converting SMILES to RDKit Mols


num_model_tasks is deprecated and its value is ignored.
Featurization = DescriptorFeaturization with rdkit_raw descriptors


2021-12-23 08:58:13,439 Formatting already featurized data...
2021-12-23 08:58:13,460 Done
2021-12-23 08:58:13,475 Transforming response data
2021-12-23 08:58:13,495 Transforming feature data
  X = np.nan_to_num((X - self.X_means) * X_weight / self.X_stds)
2021-12-23 08:58:13,531 Evaluating current model


number of features: 200


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').