In [1]:
import json
import pickle
import shutil
import sys
from pathlib import Path

import deepchem as dc
import mlflow
import numpy as np
import pandas as pd

sys.path.insert(0, '..')  # must run before `import src` so the project package resolves
import src

In [2]:
# Local files to bundle with the saved MLflow model (model weights, metadata, ...).
# The keys are the names ModelWrapper looks up through context.artifacts.
model_path = '../gnn_model/'
metadata_path = str(Path('..') / 'metadata.txt')
artifacts = dict(model_files=model_path, metadata=metadata_path)


In [3]:
# Serve as an MLflow wrapper for model
class ModelWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around the saved DeepChem GraphConv model.

    ``load_context`` restores the model from the packaged artifacts and
    ``predict`` turns a pandas DataFrame into decoded prediction labels.
    """

    # Number of prediction tasks the saved GraphConv model was built with.
    N_TASKS = 12

    def load_context(self, context):
        """Restore the model, featurizer and task names.

        Args:
            context: object provided by MLflow; ``context.artifacts`` maps the
                artifact names given at save time ('model_files', 'metadata')
                to local paths.
        """
        # Imports are local so the method does not depend on the notebook's
        # global namespace once the wrapper is loaded from the saved model.
        import deepchem as dc
        # Not used directly here; importing it surfaces a missing `src`
        # package at load time instead of at the first predict call.
        import src

        # Rebuild the network and restore its weights from the artifact dir.
        model = dc.models.GraphConvModel(self.N_TASKS, model_dir=context.artifacts['model_files'])
        model.restore()
        self.model = model

        # NOTE(review): this loads the whole Tox21 dataset only to obtain the
        # task names; consider shipping the names as an artifact instead.
        tasks, _, _ = dc.molnet.load_tox21(featurizer='GraphConv')
        self.tasks = tasks
        self.featurizer = dc.feat.ConvMolFeaturizer()

    def predict(self, context, model_input):
        """Predict labels for the rows of a pandas DataFrame.

        Args:
            context: MLflow context (unused here).
            model_input: pandas DataFrame. If it has a 'molecules' column the
                values are featurized with the ConvMol featurizer; otherwise
                the frame is converted with ``src.dc_utils.df_to_dataset``.

        Returns:
            The result of ``src.labels.inverse_transform`` applied to the
            arg-max class index of every prediction.
        """
        import deepchem as dc
        # `np` used to be resolved from the notebook's globals; import it here
        # like every other dependency of this method.
        import numpy as np
        from src import dc_utils
        from src import labels as le

        if 'molecules' in model_input.columns:
            # Raw molecule input: featurize here and keep the inputs as ids.
            molecules = list(model_input['molecules'])
            conv_mols = self.featurizer(molecules)
            ds = dc.data.NumpyDataset(conv_mols, ids=molecules, n_tasks=self.N_TASKS)
            ds.tasks = self.tasks
        else:
            # No 'molecules' column: let the project helper build the dataset.
            ds = dc_utils.df_to_dataset(model_input)

        y_pred = self.model.predict(ds)
        # Arg-max over axis 2 (presumably the class axis — confirm), then map
        # the indices back to labels.
        return le.inverse_transform(np.argmax(y_pred, axis=2))

    

In [4]:
# mlflow complains if the directory already exists, so remove any previous export
# before saving the model to the mlflow directory.
src_path = Path('../src')
mlflowpath = Path('docker', 'api', 'mlflow_root_final', 'model')

# Delete through the same Path that save_model receives, so the removed directory
# and the save target cannot drift apart; ignore_errors keeps the `rm -rf`
# behaviour of not failing when the directory does not exist yet.
shutil.rmtree(mlflowpath, ignore_errors=True)

mlflow.pyfunc.save_model(
    path=str(mlflowpath),
    python_model=ModelWrapper(),
    artifacts=artifacts,
    conda_env='docker/dc_env_docker.yml',
    code_path=['../src/'],
)

In [5]:
# Read the pyfunc model back from the directory it was just saved to.
loaded_model = mlflow.pyfunc.load_model(model_uri=str(mlflowpath))

In [6]:
# Load Tox21 with GraphConv featurization, unpack the three splits and
# print the shape of the test split's feature array.
tox21_tasks_2, tox21_datasets_2, transformers_2 = dc.molnet.load_tox21(featurizer='GraphConv')
train_dataset_2, valid_dataset_2, test_dataset_2 = tox21_datasets_2
print(f'Number of samples: {test_dataset_2.X.shape}')

Number of samples: (784,)


In [7]:
# Serialise the test split into a request with the src.dc_utils helpers, decode
# it back into a DataFrame and score it with the reloaded model.
from src.dc_utils import (
    data_to_json,
    dict_to_dataframe,
    format_request,
    json_dict_to_dict,
)

test_data = data_to_json(test_dataset_2)
req = format_request(test_data)
payload = json.loads(req)
dd = json_dict_to_dict(payload['data']['df'])
df = dict_to_dataframe(dd)
pred = loaded_model.predict(df)
len(pred)

784

In [8]:
## the incoming JSON data will look like this
request_body = {'data': src.dc_utils.dataset_to_dict(test_dataset_2)}
dd = src.dc_utils.json_dict(request_body)

In [9]:
# Decode the 'data' entry of the JSON-style dict built in the previous cell
# (presumably back into plain Python objects — confirm in src.dc_utils).
d = src.dc_utils.json_dict_to_dict(dd['data'])

In [10]:
# Convert the decoded 'df' entry into the object handed to loaded_model.predict
# (presumably a pandas DataFrame, going by the helper's name — confirm).
df = src.dc_utils.dict_to_dataframe(d['df'])

In [11]:
# Score the input rebuilt from the JSON-style dict with the reloaded MLflow model.
p = loaded_model.predict(df)

In [12]:
# Six example molecules, used to exercise the model's 'molecules' input path.
molecules = [
    'CC(C)(c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1)c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1',
    'Cc1cc(C(C)(C)C)c(O)c(C)c1Cn1c(=O)n(Cc2c(C)cc(C(C)(C)C)c(O)c2C)c(=O)n(Cc2c(C)cc(C(C)(C)C)c(O)c2C)c1=O',
    'Cc1nnc(-c2ccccc2)c(=O)n1N',
    'N=C(N)NCC1COc2ccccc2O1',
    'Cc1cccc(C)c1NC(=O)NC1=CCCN1C',
    'c1csc(C2(N3CCCCC3)CCCCC2)c1',
]

# Build a dataset from the molecules with the same steps ModelWrapper.predict
# uses. `tox21_tasks_2` is reused from the earlier load_tox21 cell: calling
# load_tox21 again here with identical arguments only repeated that work.
featurizer = dc.feat.ConvMolFeaturizer()
conv_mols = featurizer(molecules)
dataset = dc.data.NumpyDataset(conv_mols, ids=molecules, n_tasks=12)
dataset.tasks = tox21_tasks_2

In [13]:
# Single-column frame; the 'molecules' column selects the featurizing path in ModelWrapper.predict.
x = pd.DataFrame(dict(molecules=molecules))

In [14]:
# Score the molecules directly; because `x` has a 'molecules' column,
# ModelWrapper.predict featurizes them itself.
p2 = loaded_model.predict(x)

In [15]:
# Display the decoded predictions for the first six input rows.
p[:6]

[['estrogen receptor alpha, LBD (ER, LBD): inactive',
  'estrogen receptor alpha, full (ER, full): inactive',
  'aromatase: inactive',
  'aryl hydrocarbon receptor (AhR): inactive',
  'androgen receptor, full (AR, full): inactive',
  'androgen receptor, LBD (AR, LBD): inactive',
  'peroxisome proliferator-activated receptor gamma (PPAR-gamma): inactive',
  'nuclear factor (erythroid-derived 2)-like 2/antioxidant responsive element (Nrf2/ARE): inactive',
  'heat shock factor response element (HSE): inactive',
  'ATAD5: inactive',
  'mitochondrial membrane potential (MMP): inactive',
  'p53: inactive'],
 ['estrogen receptor alpha, LBD (ER, LBD): active',
  'estrogen receptor alpha, full (ER, full): inactive',
  'aromatase: active',
  'aryl hydrocarbon receptor (AhR): active',
  'androgen receptor, full (AR, full): active',
  'androgen receptor, LBD (AR, LBD): active',
  'peroxisome proliferator-activated receptor gamma (PPAR-gamma): active',
  'nuclear factor (erythroid-derived 2)-like 2