In [1]:
import shutil
import sys
from pathlib import Path

import deepchem as dc
import mlflow
import numpy as np
import pandas as pd

# put the parent directory on the import path so the project-local `src` package resolves
sys.path.insert(0, '..')
import src

In [2]:
# Files to bundle with the MLflow model (saved weights directory, metadata file).
# 'model_files' is the key ModelWrapper.load_context reads from context.artifacts.
model_path = '../gnn_model/'
metadata_path = str(Path('..') / 'metadata.txt')
artifacts = dict(model_files=model_path, metadata=metadata_path)


In [3]:
# MLflow pyfunc wrapper around the saved DeepChem GraphConvModel
class ModelWrapper(mlflow.pyfunc.PythonModel):
    """Adapter that lets MLflow load and call the graph-convolution model.

    Both methods receive a ``context`` object; its ``artifacts`` mapping is
    indexed here to locate the saved model directory.
    """

    def load_context(self, context):
        """Rebuild the GraphConvModel from the ``model_files`` artifact.

        The restored model is stored on ``self.model`` for use by ``predict``.
        """
        # Imports are kept inside the method, exactly as the wrapper had them.
        import deepchem as dc
        import pickle
        import src

        weights_dir = context.artifacts['model_files']
        # 12 is the first positional argument; the model repr printed later in
        # this notebook reports it as n_tasks=12.
        restored = dc.models.GraphConvModel(12, model_dir=weights_dir, )
        restored.restore()

        # Only publish the model on the instance once restore() has succeeded.
        self.model = restored

    def predict(self, context, model_input, encode=True):
        """Convert ``model_input`` to a dataset and hand it back with the model.

        Returns the tuple ``(self.model, dataset)``; no prediction is run here.
        ``encode`` is accepted but not used by the active code path.
        """
        from src import dc_utils, labels as le

        dataset = dc_utils.df_to_dataset(model_input)
        # Disabled prediction path, kept for reference:
        #   y_pred = self.model.predict(ds)
        #   if encode:
        #       y_pred = np.argmax(y_pred, axis=2)
        #       y_pred = le.inverse_transform(y_pred)
        # An earlier variant was: return model_fn.predict(data, self.model)
        return self.model, dataset

In [4]:
# mlflow complains if the target directory already exists, so clear any
# previous export before saving the model there.
src_path = Path('../src')
mlflowpath = Path('docker', 'mlflow_root', 'model')

# Pure-Python replacement for `!rm -rf ./docker/mlflow_root/model`: it works on
# any OS and is derived from mlflowpath, so the path that is removed can never
# drift from the path save_model writes to.
if mlflowpath.is_symlink() or mlflowpath.is_file():
    # rmtree refuses symlinks and plain files; unlink covers what `rm -rf` did
    mlflowpath.unlink()
elif mlflowpath.is_dir():
    shutil.rmtree(mlflowpath)

# Bundle the wrapper, its artifacts and the project-local `src` package so the
# saved model is self-contained.
mlflow.pyfunc.save_model(path=str(mlflowpath), python_model=ModelWrapper(),
                         artifacts=artifacts, conda_env='docker/dc_env_docker.yml',
                         code_path=[str(src_path)])

In [5]:
# Load the model back from the directory written by save_model (mlflowpath),
# using MLflow's generic pyfunc loader.
loaded_model = mlflow.pyfunc.load_model(str(mlflowpath))

In [6]:
# Load Tox21 via DeepChem's MoleculeNet loader with the 'GraphConv' featurizer.
# The call is unpacked into three values: tasks, datasets, transformers.
tox21_tasks_2, tox21_datasets_2, transformers_2 = dc.molnet.load_tox21(featurizer='GraphConv')
# The datasets value is itself unpacked into train / validation / test splits.
train_dataset_2, valid_dataset_2, test_dataset_2 = tox21_datasets_2

In [7]:
# Convert the validation split with the project helper dataset_to_df; the
# result is what gets passed to loaded_model.predict below.
# NOTE(review): presumably a pandas DataFrame, per the helper's name — confirm in src/dc_utils.
df = src.dc_utils.dataset_to_df(valid_dataset_2)

In [8]:
# ModelWrapper.predict currently returns (model, dataset) rather than
# predictions, so unpack the restored model and the converted dataset.
m, ds = loaded_model.predict(df)

In [9]:
# Display the model object returned by predict to confirm what was restored.
m



GraphConvModel(batch_normalize=None, batch_size=100, dense_layer_size=None,
               dropout=None, graph_conv_layers=None, mode='classification',
               n_classes=2, n_tasks=12, number_atom_features=None,
               uncertainty=False)

In [10]:
# List the names available in src.dc_utils.
dir(src.dc_utils)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'convmol_to_dict',
 'dataset_to_df',
 'dc',
 'df_to_dataset']

In [11]:
from src.dc_utils import dataset_to_df, df_to_dataset

In [12]:
# Show the task names attached to the validation split.
valid_dataset_2.tasks

['NR-AR',
 'NR-AR-LBD',
 'NR-AhR',
 'NR-Aromatase',
 'NR-ER',
 'NR-ER-LBD',
 'NR-PPAR-gamma',
 'SR-ARE',
 'SR-ATAD5',
 'SR-HSE',
 'SR-MMP',
 'SR-p53']

In [26]:
# Round-trip the validation split: dataset -> dataset_to_df -> df_to_dataset.
test_ds = df_to_dataset(dataset_to_df(valid_dataset_2))

In [35]:
# Inspect the atom_features of the first 'X' entry of the round-tripped dataset.
test_ds.to_dataframe()['X'][0].atom_features

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [49]:
# Display the validation split's repr (shapes, ids, task names).
valid_dataset_2

<DiskDataset X.shape: (783,), y.shape: (783, 12), w.shape: (783, 12), ids: ['N#C[C@@H]1CC(F)(F)CN1C(=O)CNC1CC2CCC(C1)N2c1ncccn1'
 'CN(C)C(=O)NC1(c2ccccc2)CCN(CCC[C@@]2(c3ccc(Cl)c(Cl)c3)CCCN(C(=O)c3ccccc3)C2)CC1'
 'CSc1nnc(C(C)(C)C)c(=O)n1N' ...
 'O=C(O[C@H]1CN2CCC1CC2)N1CCc2ccccc2[C@@H]1c1ccccc1'
 'C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@H]3C(=C)C[C@@]21CC'
 'NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3c(c2)CCO3)C1'], task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>

In [77]:
# List the attributes of the first X entry of `npds`.
# NOTE(review): `npds` is not defined in any cell shown in this notebook, so
# this cell fails on a fresh kernel — restore the defining cell or remove this one.
dir(npds.X[0])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_deg_sort',
 'agglomerate_mols',
 'atom_features',
 'canon_adj_list',
 'deg_adj_lists',
 'deg_block_indices',
 'deg_id_list',
 'deg_list',
 'deg_slice',
 'deg_start',
 'degree_list',
 'get_adjacency_list',
 'get_atom_features',
 'get_atoms_with_deg',
 'get_deg_adjacency_lists',
 'get_deg_slice',
 'get_null_mol',
 'get_num_atoms',
 'get_num_atoms_with_deg',
 'max_deg',
 'membership',
 'min_deg',
 'n_atoms',
 'n_feat']

In [55]:
# Run the restored model on the round-tripped validation split.
# This currently raises the AttributeError shown below ('numpy.ndarray' object
# has no attribute 'atom_features').
# NOTE(review): presumably some X entries come back from the round trip as
# plain arrays instead of featurized molecule objects — check df_to_dataset.
m.predict(df_to_dataset(dataset_to_df(valid_dataset_2)))

AttributeError: 'numpy.ndarray' object has no attribute 'atom_features'