In [1]:
import pdb
import logging
import os
import sys
import deepchem as dc
import numpy as np
import pandas as pd
from deepchem.data import DiskDataset
import collections
import atomsci.ddm.pipeline.splitting as split

  from numpy.core.umath_tests import inner1d


In [2]:
# Small hand-picked list of molecule strings that serves as the toy
# dataset for the splitting examples.
mols = [
    'C1=CC2=C(C=C1)C1=CC=CC=C21',
    'O=C1C=CC(=O)C2=C1OC=CO2',
    'C1=C[N]C=C1',
    'C1=CC=CC=C[C+]1',
    'C1=[C]NC=C1',
    'N[C@@H](C)C(=O)O',
    'N[C@H](C)C(=O)O',
    'CC',
    'O=C=O',
    'C#N',
    'CCN(CC)CC',
    'CC(=O)O',
    'C1CCCCC1',
    'c1ccccc1',
]
# Heading first, then the raw list on its own line.
print("Original set of molecules", mols, sep="\n")

Original set of molecules
['C1=CC2=C(C=C1)C1=CC=CC=C21', 'O=C1C=CC(=O)C2=C1OC=CO2', 'C1=C[N]C=C1', 'C1=CC=CC=C[C+]1', 'C1=[C]NC=C1', 'N[C@@H](C)C(=O)O', 'N[C@H](C)C(=O)O', 'CC', 'O=C=O', 'C#N', 'CCN(CC)CC', 'CC(=O)O', 'C1CCCCC1', 'c1ccccc1']


In [3]:
# Learn using the Random Splitter

# The molecule strings are used both as the feature column and as the ids,
# so the ids printed below identify which molecule landed in which subset.
dataset = dc.data.NumpyDataset(X=mols, ids=mols)
splitter = dc.splits.RandomSplitter()
# Pass an explicit seed so the random assignment is the same on every run
# (e.g. after Restart Kernel -> Run All). Without it the three subsets
# change each time this cell is executed. The value itself is arbitrary.
train, valid, test = splitter.train_valid_test_split(dataset, seed=42)
# The return values are dc.data.Dataset objects so we need to extract
# the ids
print("Training set")
print(train.ids)
print("Valid set")
print(valid.ids)
print("Test set")
print(test.ids)

Training set
['C1=[C]NC=C1' 'CCN(CC)CC' 'C1=C[N]C=C1' 'C1=CC2=C(C=C1)C1=CC=CC=C21'
 'C#N' 'c1ccccc1' 'O=C=O' 'CC(=O)O' 'C1=CC=CC=C[C+]1' 'C1CCCCC1' 'CC']
Valid set
['O=C1C=CC(=O)C2=C1OC=CO2']
Test set
['N[C@H](C)C(=O)O' 'N[C@@H](C)C(=O)O']


In [6]:
# Scaffold Splitter demonstration on the toy molecule list.

dataset = dc.data.NumpyDataset(X=mols, ids=mols)
splitter = dc.splits.ScaffoldSplitter()
train, valid, test = splitter.train_valid_test_split(dataset)

# Each returned subset is a dc.data.Dataset object; only its ids are shown.
for heading, part in (
    ("Training set", train),
    ("Valid set", valid),
    ("Test set", test),
):
    print(heading)
    print(part.ids)

Training set
['N[C@@H](C)C(=O)O' 'N[C@H](C)C(=O)O' 'CC' 'O=C=O' 'C#N' 'CCN(CC)CC'
 'CC(=O)O' 'c1ccccc1' 'C1CCCCC1' 'C1=[C]NC=C1' 'C1=CC=CC=C[C+]1']
Valid set
['C1=C[N]C=C1']
Test set
['O=C1C=CC(=O)C2=C1OC=CO2' 'C1=CC2=C(C=C1)C1=CC=CC=C21']


In [7]:
from atomsci.ddm.pipeline import model_pipeline as mp
from atomsci.ddm.pipeline import parameter_parser as parse



In [21]:
# Scaffold split through the model pipeline -- DO NOT RUN DURING CLASS.
#dataset_split
#
# Every value is passed as a string. The blank-comment separators below only
# group the keys by name for readability; key order is unchanged.
params = dict(
    # code / interpreter locations
    script_dir="/usr/workspace/hiran/AMPL/atomsci/ddm",
    python_path="/usr/mic/bio/anaconda3/bin/python",
    # input dataset
    dataset_key="/usr/workspace/atom/public_dsets/DTC/ml_ready/kcnh2_dtc_base_smiles_all.csv",
    datastore="False",
    # splitting options
    split_only="True",
    splitter="scaffold",
    split_valid_frac="0.15",
    split_test_frac="0.15",
    previously_split="False",
    # prediction target and column names
    prediction_type="regression",
    response_cols="PIC50",
    id_col="compound_id",
    smiles_col="base_rdkit_smiles",
    # output location / system
    result_dir="/p/lustre3/hiran/atomcode/results_dir/kcnh2_models_project_class",
    system="LC",
    # model and featurization options
    transformers="True",
    model_type="NN",
    featurizer="graphconv",
    descriptor_type="graphconv",
    learning_rate="0.0007",
    layer_sizes="64,64,32",
    dropouts="0.0,0.0,0.0",
    # run options
    save_results="False",
    max_epochs="100",
    verbose="True",
)

# Parse the raw dict, build the pipeline from the parsed parameters, and run it.
pparams = parse.wrapper(params)
MP = mp.ModelPipeline(pparams)
MP.train_model()

number of features: 75


2021-02-24 14:24:26,088 Splitting data by scaffold
2021-02-24 14:25:00,436 Dataset split table saved to /usr/workspace/atom/public_dsets/DTC/ml_ready/kcnh2_dtc_base_smiles_all_train_valid_test_scaffold_0cab25e3-954f-4cc1-9eba-deb365468ecd.csv


In [8]:
# Load the scaffold split table from the CSV path reported in the
# scaffold-split log output.
scaffold_split_csv = "/usr/workspace/atom/public_dsets/DTC/ml_ready/kcnh2_dtc_base_smiles_all_train_valid_test_scaffold_0cab25e3-954f-4cc1-9eba-deb365468ecd.csv"
split_dataset = pd.read_csv(scaffold_split_csv)

In [9]:
# Number of rows per `subset` label in the scaffold split table.
scaffold_subset_counts = split_dataset.pivot_table(index=['subset'], aggfunc='size')
scaffold_subset_counts

subset
test      898
train    4189
valid     898
dtype: int64

In [52]:
# Random split through the model pipeline -- DO NOT RUN DURING CLASS.
#dataset_split
#
# Every value is passed as a string. The blank-comment separators below only
# group the keys by name for readability; key order is unchanged.
params = dict(
    # code / interpreter locations
    script_dir="/usr/workspace/hiran/AMPL/atomsci/ddm",
    python_path="/usr/mic/bio/anaconda3/bin/python",
    # input dataset
    dataset_key="/usr/workspace/atom/public_dsets/DTC/ml_ready/kcnh2_dtc_base_smiles_all.csv",
    datastore="False",
    # splitting options
    split_only="True",
    splitter="random",
    split_valid_frac="0.15",
    split_test_frac="0.15",
    previously_split="False",
    # prediction target and column names
    prediction_type="regression",
    response_cols="PIC50",
    id_col="compound_id",
    smiles_col="base_rdkit_smiles",
    # output location / system
    result_dir="/p/lustre3/hiran/atomcode/results_dir/kcnh2_models_project_class_randomsplit",
    system="LC",
    # model and featurization options
    transformers="True",
    model_type="NN",
    featurizer="graphconv",
    descriptor_type="graphconv",
    learning_rate="0.0007",
    layer_sizes="64,64,32",
    dropouts="0.0,0.0,0.0",
    # run options
    save_results="False",
    max_epochs="100",
    verbose="True",
)

# Parse the raw dict, build the pipeline from the parsed parameters, and run it.
pparams = parse.wrapper(params)
MP = mp.ModelPipeline(pparams)
MP.train_model()

number of features: 75


2021-02-24 14:44:28,766 Splitting data by random
2021-02-24 14:44:39,683 Dataset split table saved to /usr/workspace/atom/public_dsets/DTC/ml_ready/kcnh2_dtc_base_smiles_all_train_valid_test_random_ede8e1bb-07a0-4306-92af-0262c7f03179.csv


In [10]:
# Load the random split table from the CSV path reported in the
# random-split log output.
random_split_csv = "/usr/workspace/atom/public_dsets/DTC/ml_ready/kcnh2_dtc_base_smiles_all_train_valid_test_random_ede8e1bb-07a0-4306-92af-0262c7f03179.csv"
split_dataset_randomsplit = pd.read_csv(random_split_csv)

In [11]:
# Number of rows per `subset` label in the random split table.
random_subset_counts = split_dataset_randomsplit.pivot_table(index=['subset'], aggfunc='size')
random_subset_counts

subset
test      898
train    4189
valid     898
dtype: int64

In [12]:
# Dataset split using Random Split.
# Show only the first rows: the table has several thousand rows, so a bounded
# preview keeps the cell output small instead of dumping the whole frame.
split_dataset_randomsplit.head(10)

Unnamed: 0,cmpd_id,fold,subset
0,OPEHXHIHHLIHNF-LZHZRURDSA-N,0,train
1,PYPUGVATTLSFNC-FQEVSTJZSA-N,0,train
2,YGKGINZAHYHTNB-JLHYYAGUSA-N,0,train
3,WRUVHZFRRXADMC-UHFFFAOYSA-N,0,train
4,MFWNKCLOYSRHCJ-UHFFFAOYSA-N,0,train
5,KENVEDVQQZAGDU-UHFFFAOYSA-N,0,train
6,WIDBHTRKXQCNBH-UHFFFAOYSA-N,0,train
7,MELLPTHIQUBYAE-LJAQVGFWSA-N,0,train
8,PBDMWGSHTOZGOM-HNNXBMFYSA-N,0,train
9,TXYRXUAVFCQDBX-UHFFFAOYSA-N,0,train


In [56]:
# Dataset split using Scaffold Split.
# Show only the first rows: the table has several thousand rows, so a bounded
# preview keeps the cell output small instead of dumping the whole frame.
split_dataset.head(10)

Unnamed: 0,cmpd_id,fold,subset
0,OPEHXHIHHLIHNF-LZHZRURDSA-N,0,train
1,PYPUGVATTLSFNC-FQEVSTJZSA-N,0,train
2,HXVMFBSINMZKRO-UHFFFAOYSA-N,0,train
3,WRUVHZFRRXADMC-UHFFFAOYSA-N,0,train
4,KENVEDVQQZAGDU-UHFFFAOYSA-N,0,train
5,TULDQRGKIWKKNV-SFHVURJKSA-N,0,train
6,WIDBHTRKXQCNBH-UHFFFAOYSA-N,0,train
7,MELLPTHIQUBYAE-LJAQVGFWSA-N,0,train
8,TXYRXUAVFCQDBX-UHFFFAOYSA-N,0,train
9,VYFZIPUVTIMYCJ-NWDGAFQWSA-N,0,train


In [57]:
# Sanity check: do the scaffold and random split tables hold exactly the
# same contents?
splits_identical = split_dataset.equals(split_dataset_randomsplit)
splits_identical

False

In [35]:
import atomsci.ddm.pipeline.model_pipeline as mp
import atomsci.ddm.pipeline.chem_diversity as cd
import atomsci.ddm.pipeline.parameter_parser as parse
import atomsci.ddm.pipeline.featurization as feat
import atomsci.ddm.pipeline.model_datasets as md
import atomsci.ddm.pipeline.model_wrapper as mod

import atomsci.ddm.pipeline.splitting as split
import atomsci.ddm.pipeline.perf_plots as pp
import atomsci.ddm.pipeline.model_tracker as trkr

from scipy.stats.kde import gaussian_kde
import matplotlib
import matplotlib.pyplot as plt

matplotlib.style.use('ggplot')
matplotlib.rc('xtick', labelsize=12)
matplotlib.rc('ytick', labelsize=12)
matplotlib.rc('axes', labelsize=12)

%matplotlib inline

In [15]:
# Build the performance table for the models trained with the scaffold split.
from atomsci.ddm.pipeline import compare_models as cmp

# Bucket and path of the dataset the models were trained on.
bucket = 'public'
dataset_key = '/usr/workspace/atom/public_dsets/DTC/ml_ready/kcnh2_dtc_base_smiles_all.csv'

# Collection to look the models up in.
collection = 'kcnh2_dtc'
collection_name = collection
# NOTE(review): this rebinds the name `collections`, which the first cell
# imported as the standard-library module; after this line the module is no
# longer reachable under that name. Kept as-is in case later cells read the
# list -- consider renaming once all users are known.
collections = [collection_name]

training_perf_table = cmp.get_training_perf_table(
    dataset_key,
    bucket,
    collection_name,
)


Finding models trained on public dataset /usr/workspace/atom/public_dsets/DTC/ml_ready/kcnh2_dtc_base_smiles_all.csv
Found 10 matching models


In [16]:
# Display the table returned by cmp.get_training_perf_table in the previous
# cell (one row per matching model, per the recorded output below).
training_perf_table

Unnamed: 0,best_epoch,dataset_key,dropouts,featurizer,layer_sizes,learning_rate,max_epochs,model_type,model_uuid,rf_estimators,rf_max_depth,rf_max_features,splitter,xgb_gamma,xgb_learning_rate,r2_score_train,r2_score_valid,r2_score_test
2,29,/usr/workspace/atom/public_dsets/DTC/ml_ready/...,"0.00,0.00,0.00",graphconv,128128256,0.0007,100,NN,ee6407d8-6767-4931-9799-b1c22bf23a27,,,,scaffold,,,0.850257,0.457489,0.31557
1,25,/usr/workspace/atom/public_dsets/DTC/ml_ready/...,"0.00,0.00,0.00",graphconv,25625664,0.0007,100,NN,9a82259c-0599-4e7f-acb6-269a4995c005,,,,scaffold,,,0.803471,0.449899,0.320166
0,29,/usr/workspace/atom/public_dsets/DTC/ml_ready/...,"0.00,0.00,0.00",graphconv,256256256,0.0007,100,NN,a77a9b60-dc2e-447a-92bf-7d78118c4365,,,,scaffold,,,0.856982,0.447553,0.310497
8,34,/usr/workspace/atom/public_dsets/DTC/ml_ready/...,"0.00,0.00,0.00",graphconv,6464256,0.0007,100,NN,a5054caf-476a-4cd1-a687-269f6cf9ddf1,,,,scaffold,,,0.821047,0.438644,0.316661
4,40,/usr/workspace/atom/public_dsets/DTC/ml_ready/...,"0.00,0.00,0.00",graphconv,256256128,0.0007,100,NN,12e16a16-e329-4e1b-8855-f3443ac4a31f,,,,scaffold,,,0.828562,0.426121,0.361736
5,40,/usr/workspace/atom/public_dsets/DTC/ml_ready/...,"0.00,0.00,0.00",graphconv,12812864,0.0007,100,NN,fa7d4728-26ff-46c9-9e0b-f4565683dc78,,,,scaffold,,,0.797914,0.410568,0.325595
9,45,/usr/workspace/atom/public_dsets/DTC/ml_ready/...,"0.00,0.00,0.00",graphconv,6464128,0.0007,100,NN,3d9f96f6-b15d-4c01-b4fb-ba8c017c20d2,,,,scaffold,,,0.785872,0.403416,0.26515
3,31,/usr/workspace/atom/public_dsets/DTC/ml_ready/...,"0.00,0.00,0.00",graphconv,128128128,0.0007,100,NN,0f0f3fac-72e1-4218-bcfc-67e5bf2a1780,,,,scaffold,,,0.816506,0.400714,0.341997
7,34,/usr/workspace/atom/public_dsets/DTC/ml_ready/...,"0.00,0.00,0.00",graphconv,646464,0.0007,100,NN,d7ba3898-7b87-4ed9-8720-b2c81ea120c0,,,,scaffold,,,0.745418,0.384794,0.23678
6,22,/usr/workspace/atom/public_dsets/DTC/ml_ready/...,"0.00,0.00,0.00",graphconv,646432,0.0007,100,NN,5ece022a-c70d-4494-b22c-997df414c380,,,,scaffold,,,0.710938,0.353172,0.268749
