# Prediction Screening 

This notebook allows the user to load in their best performing machine learning model (under the variable best model). Then, using the load_from_model_file function from AMPL, the individual will load their model so it can be used to make predictions. From there, individuals can load SMILEs strings using the add_smiles function to read SMILEs strings into a dataframe and predict upon.

##### In case you run into issues when running this notebook using SLURM
-c for more cores, --mem for more memory, -t 1:00:00 for more time (can request up to 4 hours on Brown)

#### Importing Libraries 

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
sns.set_context("poster")
sns.set_style("whitegrid")
sns.set_palette("Set2")

import pandas as pd
import os, json, sys, glob, pickle

from atomsci.ddm.pipeline import model_pipeline as mp
from atomsci.ddm.pipeline import parameter_parser as parse
from atomsci.ddm.pipeline import perf_data

from atomsci.ddm.pipeline import predict_from_model as pfm
from atomsci.ddm.utils import curate_data
from atomsci.ddm.utils import struct_utils as su

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

2022-04-26 15:06:57.076067: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /apps/spack/brown/apps/proj/5.2.0-gcc-4.8.5-usib7od/lib:/apps/spack/brown/apps/geos/3.7.2-gcc-4.8.5-3vanyva/lib:/apps/spack/brown/apps/gdal/2.4.2-gcc-4.8.5-uj736h3/lib:/apps/spack/brown/apps/netcdf/4.5.0-gcc-6.3.0-d6fczmr/lib:/apps/spack/brown/apps/libtiff/4.0.10-gcc-6.3.0-6p5trqs/lib:/apps/spack/brown/apps/hdf5/1.8.16-gcc-6.3.0-7q7ndrz/lib:/apps/spack/brown/apps/hdf/4.2.14-gcc-6.3.0-2xg7pyg/lib:/scratch/brown/kamstut/tdm/apps/jupyter/kernels/llnl/.venv/lib:/apps/spack/brown/apps/r/4.0.0-gcc-6.3.0-hrvmcqp/rlib/R/lib:/apps/spack/brown/apps/openblas/0.3.7-gcc-6.3.0-qk24sho/lib:/apps/spack/brown/apps/tk/8.6.8-gcc-6.3.0-6qaesqb/lib:/apps/spack/brown/apps/tcl/8.6.8-gcc-6.3.0-n6mxabo/lib:/apps/spack/brown/apps/zlib/1.2.11-gcc-4.8.5-pkmj6e7/lib:/app

#### Defining the load_model_from_file function 

In [2]:
import tempfile,tarfile,os
def load_model_from_file(model_path) :
    reload_dir = tempfile.mkdtemp()
    model_fp = tarfile.open(model_path, mode='r:gz')
    model_fp.extractall(path=reload_dir)
    model_fp.close()
    # Open the model_metadata.json file containing the reloaded model parameters
    config_file_path = os.path.join(reload_dir, 'model_metadata.json')
    with open(config_file_path) as f:
        config = json.loads(f.read())
    # Set the transformer_key parameter to point to the transformer pickle file we just extracted
    try:
        has_transformers = config['model_parameters']['transformers']
        if has_transformers:
            config['model_parameters']['transformer_key'] = "%s/transformers.pkl" % reload_dir
    except KeyError:
        pass
    model_params = parse.wrapper(config)
    model_params.result_dir = tempfile.mkdtemp()
    model_params.featurizer = 'computed_descriptors'
    split_uuid = model_params.split_uuid
    dset_df = pd.read_csv(model_params.dataset_key)
    directory = os.path.dirname(model_params.dataset_key)
    dataset_name = os.path.splitext(os.path.basename(model_params.dataset_key))[0]
    if model_params.split_strategy == 'k_fold_cv':
        split_prefix = "%d_fold_cv_%s" % (model_params.num_folds, model_params.splitter)
    else:
        split_prefix = "train_valid_test_%s" % (model_params.splitter)
    split_path = os.path.join(directory, '%s_%s_%s.csv' % (dataset_name, split_prefix, split_uuid))
    split_df = pd.read_csv(split_path)

    pipe = mp.create_prediction_pipeline_from_file(model_params, reload_dir)
    return pipe,model_params,split_df

### Reading in best performing model

In [3]:
best_model = '/depot/tdm-atom/data/shared/model_depot/team1/OPRD1_OPRK1_OPRM1_data_with_smiles_model_82d3a3b5-0627-4aa4-aca7-07e51181b12a.tar.gz'

### Loading the best performing model

In [4]:
load_model_from_file(best_model)

2022-04-26 15:08:49,506 ['ampl_version', 'time_generated', 'best_epoch', 'time_built', 'dataset_hash', 'dataset_metadata', 'training_metrics'] are not part of the accepted list of parameters and will be ignored
2022-04-26 15:08:49,570 Created a dataset hash 'bb524d057c54ea478950d2a591bdb416' from dataset_key '/depot/tdm-atom/data/shared/model_depot/team1/OPRD1_OPRK1_OPRM1_data_with_smiles.csv'
2022-04-26 15:08:49,715 ['ampl_version', 'time_generated', 'best_epoch', 'time_built', 'dataset_hash', 'dataset_metadata', 'training_metrics'] are not part of the accepted list of parameters and will be ignored
2022-04-26 15:08:49,736 Created a dataset hash 'bb524d057c54ea478950d2a591bdb416' from dataset_key '/depot/tdm-atom/data/shared/model_depot/team1/OPRD1_OPRK1_OPRM1_data_with_smiles.csv'


num_model_tasks is deprecated and its value is ignored.
num_model_tasks is deprecated and its value is ignored.
Featurization = DynamicFeaturization with graphconv features


2022-04-26 15:08:49.801768: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /apps/spack/brown/apps/proj/5.2.0-gcc-4.8.5-usib7od/lib:/apps/spack/brown/apps/geos/3.7.2-gcc-4.8.5-3vanyva/lib:/apps/spack/brown/apps/gdal/2.4.2-gcc-4.8.5-uj736h3/lib:/apps/spack/brown/apps/netcdf/4.5.0-gcc-6.3.0-d6fczmr/lib:/apps/spack/brown/apps/libtiff/4.0.10-gcc-6.3.0-6p5trqs/lib:/apps/spack/brown/apps/hdf5/1.8.16-gcc-6.3.0-7q7ndrz/lib:/apps/spack/brown/apps/hdf/4.2.14-gcc-6.3.0-2xg7pyg/lib:/scratch/brown/kamstut/tdm/apps/jupyter/kernels/llnl/.venv/lib:/apps/spack/brown/apps/r/4.0.0-gcc-6.3.0-hrvmcqp/rlib/R/lib:/apps/spack/brown/apps/openblas/0.3.7-gcc-6.3.0-qk24sho/lib:/apps/spack/brown/apps/tk/8.6.8-gcc-6.3.0-6qaesqb/lib:/apps/spack/brown/apps/tcl/8.6.8-gcc-6.3.0-n6mxabo/lib:/apps/spack/brown/apps/zlib/1.2.11-gcc-4.8.5-pkmj6e7/lib:/apps/spack/br

(<atomsci.ddm.pipeline.model_pipeline.ModelPipeline at 0x2ab56a6d1640>,
 Namespace(bucket='public', dataset_key='/depot/tdm-atom/data/shared/model_depot/team1/OPRD1_OPRK1_OPRM1_data_with_smiles.csv', dataset_name=None, dataset_oid=None, datastore=False, id_col='compound_id', min_compound_number=200, response_cols=['target_OPRD1_standard_value', 'target_OPRK1_standard_value', 'target_OPRM1_standard_value'], save_results=False, smiles_col='base_rdkit_smiles', autoencoder_bucket=None, autoencoder_key=None, autoencoder_type='molvae', mol_vae_model_file=None, class_number=2, class_name=None, descriptor_bucket='public', descriptor_key=None, descriptor_oid=None, descriptor_spec_bucket='', descriptor_spec_key='/scratch/brown/kamstut/tdm/apps/jupyter/kernels/llnl/.venv/lib/python3.9/site-packages/atomsci/ddm/data/descriptor_sets_sources_by_descr_type.csv', descriptor_type='moe', moe_threads=-1, ecfp_radius=2, ecfp_size=1024, featurizer='computed_descriptors', model_choice_score_type='r2', model

### Reading in SMILEs Strings

First, we are creating an empty list called smiles

In [5]:
smiles = []

Next, we are creating a function called add_smiles.

This function will open each file it reads in the dataset directory (which you will define) and append the SMILEs strings to the smiles list

In [6]:
def add_smiles(dataset_directory):
    for filename in os.scandir(dataset_directory):
        with open(filename, "r") as ins:
            for line in ins:
                smiles.append(line.split("\n")[0])

Applying the add_smiles function

The function will find all of the files in this directory and append the SMILEs strings. 


The defined directories are where the SMILEs strings are currently located in Brown for Purdue University.

In [7]:
# Some of the paths are commented out because the llnl kernel will die when trying to read all of them in (there's almost 1 billion SMILEs strings!) 
# So, you can do a couple and later just concatenate the csv files or dataframes (depending on which path you take).

add_smiles("/depot/tdm-atom/data/allen99/S/H03")
#add_smiles("/depot/tdm-atom/data/allen99/S/H04")
#add_smiles("/depot/tdm-atom/data/allen99/S/H05")
#add_smiles("/depot/tdm-atom/data/allen99/S/H06")
#add_smiles("/depot/tdm-atom/data/allen99/S/H07")
#add_smiles("/depot/tdm-atom/data/allen99/S/H08")
#add_smiles("/depot/tdm-atom/data/allen99/S/H09")
#add_smiles("/depot/tdm-atom/data/allen99/S/H10")
#add_smiles("/depot/tdm-atom/data/allen99/S/H11")
#add_smiles("/depot/tdm-atom/data/allen99/S/H12")
#add_smiles("/depot/tdm-atom/data/allen99/S/H13")
#add_smiles("/depot/tdm-atom/data/allen99/S/H14")
#add_smiles("/depot/tdm-atom/data/allen99/S/H15")
#add_smiles("/depot/tdm-atom/data/allen99/S/H16")
#add_smiles("/depot/tdm-atom/data/allen99/S/H17")
#add_smiles("/depot/tdm-atom/data/allen99/S/H18")
#add_smiles("/depot/tdm-atom/data/allen99/S/H19")

Now, we are creating a dataframe from our list and looking at the shape and head of it.

In [8]:
smiles_df = pd.DataFrame(smiles, columns=['smiles_col'])
print(smiles_df.shape)
smiles_df.head()

(1, 1)


Unnamed: 0,smiles_col
0,CSC s_62____875850____876088


### Defining the predict_from_model_file function from AMPL

In [9]:
def predict_from_model_file(model_path, input_df, id_col='compound_id', smiles_col='smiles_col', response_col=None, is_featurized=False, dont_standardize=True):
    input_df, pred_params = pfm._prepare_input_data(input_df, id_col, smiles_col, response_col, dont_standardize)

    has_responses = ('response_cols' in pred_params)
    pred_params = parse.wrapper(pred_params)

    pipe = mp.create_prediction_pipeline_from_file(pred_params, reload_dir=None, model_path=model_path)
    if pipe.params.model_type == 'xgboost':
        pipe.params.uncertainty = False
    pred_df = pipe.predict_full_dataset(input_df, contains_responses=has_responses, is_featurized=is_featurized,
                                        dset_params=pred_params)
    pred_df = pred_df.sort_values(by=id_col)
    return pred_df

#### Predicting on the SMILES Strings in the smiles_df using predict_from_model_file

In [10]:
model_pred = pfm.predict_from_model_file(model_path= best_model,
                                         input_df=smiles_df, 
                                         id_col='compound_id', 
                                         smiles_col='smiles_col', 
                                         response_col=None, 
                                         dont_standardize=True, 
                                         is_featurized = False)

num_model_tasks is deprecated and its value is ignored.
Featurization = DynamicFeaturization with graphconv features
number of features: 75


2022-04-26 15:08:53.156877: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


ValueError: Length of values (3) does not match length of index (1)

### Looking at the returning dataframe from predicting

In [None]:
model_pred

### Setting a Path and Saving the Dataframe as a CSV File

In [29]:
path = r'/home/rwilfong/'

In [30]:
model_pred.to_csv(path+"opioid.csv")