In [1]:
# Expose only GPU 0 to TensorFlow (CUDA_VISIBLE_DEVICES=0 selects a GPU; it does not force CPU execution)
import os
%env CUDA_VISIBLE_DEVICES=0
%env KERAS_BACKEND=tensorflow

env: CUDA_VISIBLE_DEVICES=0
env: KERAS_BACKEND=tensorflow


In [2]:
import os
os.path

<module 'posixpath' from '/home/u3/navaro/.conda/envs/tensorflow-gpu-1.15.2/lib/python3.6/posixpath.py'>

In [3]:
import pandas as pd

from SMILESX import main, inference
%matplotlib inline

Using TensorFlow backend.




Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0000:0b:00.0, compute capability: 7.0
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0000:0b:00.0, compute capability: 7.0
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device



#### Definition of data

In [4]:
# Directory containing the dataset CSV files; it is joined with the file name
# when the data is loaded below.
validation_data_dir = "validation_data/"

In [5]:
# File extension appended to the dataset base name to form the file name to read.
extension = '.csv'

In [6]:
data_name = 'FreeSolv' # FreeSolv, ESOL, Lipophilicity
# Name of the property (target) column. The following cell overwrites it for the
# three datasets listed above and keeps this value for any other `data_name`.
prop_tag = ''

In [7]:
# Built-in datasets: data_name -> (CSV base name, name of the property column).
_dataset_presets = {
    'FreeSolv': ('FreeSolv_SAMPL', 'expt'),
    'ESOL': ('ESOL_delaney-processed', 'measured log solubility in mols per litre'),
    'Lipophilicity': ('Lipophilicity', 'exp'),
}

if data_name in _dataset_presets:
    data_filename, prop_tag = _dataset_presets[data_name]
else:
    # Custom dataset: the file is named after the dataset itself and the
    # property column is whatever `prop_tag` was set to beforehand.
    data_filename, prop_tag = data_name, prop_tag

In [8]:
import os

# Full path of the dataset file: <validation_data_dir>/<data_filename><extension>.
data_file = os.path.join(validation_data_dir, data_filename + extension)

try:
    sol_data = pd.read_csv(data_file)
except FileNotFoundError as err:
    # Fail here with an explicit message instead of only printing it: carrying on
    # would leave `sol_data` undefined (or stale from a previous run), and the
    # following cells would break with a far less helpful error.
    raise FileNotFoundError(f"file {data_file} doesn't exist") from err

In [9]:
sol_data.head(3)

Unnamed: 0.1,Unnamed: 0,iupac,smiles,expt,calc
0,0,"4-methoxy-N,N-dimethyl-benzamide",COc1ccc(C(=O)N(C)C)cc1,-11.01,-9.625
1,1,methanesulfonyl chloride,CS(=O)(=O)Cl,-4.87,-6.219
2,2,3-methylbut-1-ene,C=CC(C)C,1.83,2.452


In [10]:
# Keep only the SMILES column and the selected property column.
# Note: this rebinds `sol_data`, so the other columns are only recoverable by
# re-running the cell that loads the CSV.
sol_data = sol_data[['smiles',prop_tag]]

In [11]:
sol_data.head()

Unnamed: 0,smiles,expt
0,COc1ccc(C(=O)N(C)C)cc1,-11.01
1,CS(=O)(=O)Cl,-4.87
2,C=CC(C)C,1.83
3,CCc1cnccn1,-5.45
4,CCCCCCCO,-4.21


In [12]:
sol_data.shape

(642, 2)

#### Hyperparameter optimization with GPyOpt (Bayesian optimization)

In [13]:
# Discrete candidates used for the size-type hyperparameters: 8, 16, ..., 1024.
dhyp_range = [2 ** exponent for exponent in range(3, 11)]
# Discrete candidates 2.0, 2.1, ..., 3.9; Adam's learning rate = 10^(-dalpha_range)
dalpha_range = [tenths / 10. for tenths in range(20, 40)]

# The three architecture sizes are searched over the same grid for every dataset.
bounds = [
    {'name': hyper_name, 'type': 'discrete', 'domain': dhyp_range}
    for hyper_name in ('lstmunits', 'denseunits', 'embedding')
]

if data_name == 'Lipophilicity':
    # For Lipophilicity the batch size and learning-rate exponent are pinned
    # to a single value each instead of being searched.
    bounds.extend([
        {'name': 'batchsize', 'type': 'discrete', 'domain': (1024, 1024)},  # fixed
        {'name': 'lrate', 'type': 'discrete', 'domain': (3, 3)},  # fixed
    ])
else:
    # Every other dataset also searches over batch size and learning rate.
    bounds.extend([
        {'name': 'batchsize', 'type': 'discrete', 'domain': dhyp_range},
        {'name': 'lrate', 'type': 'discrete', 'domain': dalpha_range},
    ])

In [14]:
%%time
# Launch the SMILES-X run on the selected dataset.
# NOTE(review): the groupings below are inferred from the argument names only;
# confirm the exact meaning of each option against the SMILESX documentation.
main.Main(
    # --- data and output location ---
    data=sol_data,
    data_name=data_name,
    data_units='',
    augmentation=True,
    k_fold_number=3,
    outdir="./data/",
    # --- Bayesian optimisation over the search space defined in `bounds` ---
    bayopt_on=True,
    bayopt_bounds=bounds,
    bayopt_n_epochs=1,
    bayopt_n_rounds=1,
    bayopt_it_factor=1,
    # --- reference hyperparameters (presumably used as fallback values; TODO confirm) ---
    lstmunits_ref=128,
    denseunits_ref=16,
    embedding_ref=32,
    batch_size_ref=8,
    alpha_ref=22,
    # --- hardware ---
    n_gpus=1,
    bridge_type='NVLink',
    # --- training length ---
    n_epochs=100,
    patience=50,
)

***SMILES_X starts...***


******
***Fold #0 initiated...***
******
***Sampling and splitting of the dataset.***

Scaler: RobustScaler(copy=True, quantile_range=(5.0, 95.0), with_centering=True,
             with_scaling=True)
Train/valid/test splits: 0.80/0.10/0.10


***Data augmentation to True***

Enumerated SMILES:
	Training set: 4352
	Validation set: 474
	Test set: 529

***Tokenization of SMILES.***

Examples of tokenized SMILES from a training set:
[[' ', 'C', '=', 'C', 'C', 'C', ' '], [' ', 'C', '(', 'C', 'C', ')', '=', 'C', ' '], [' ', 'C', '(', 'C', ')', 'C', '=', 'C', ' '], [' ', 'C', 'C', 'C', '=', 'C', ' '], [' ', 'c', '1', 'c', 'c', 'c', '2', 'n', 'c', 'c', 'c', 'c', '2', 'c', '1', ' ']]

Number of tokens only present in a training set: 33

Number of tokens only present in a validation set: 22
Is the validation set a subset of the training set: True
What are the tokens by which they differ: set()

Number of tokens only present in a test set: 26
Is the test set a subset of 

In [15]:
# Run inference with the models stored under `outdir` for this dataset.
# 'ABC' and 'DEF' are not valid SMILES: RDKit reports parse errors for them and
# the resulting table only contains rows for the three valid molecules.
pred_from_ens = inference.Inference(
    data_name=data_name,
    data_units='',
    k_fold_number=3,
    augmentation=True,
    outdir="./data/",
    smiles_list=['CC', 'CCC', 'C=O', 'ABC', 'DEF'],
)

***SMILES_X for inference starts...***


***Checking the SMILES list for inference***

***Data augmentation.***

Enumerated SMILES: 5

***Tokenization of SMILES.***



RDKit ERROR: [10:09:27] SMILES Parse Error: syntax error while parsing: ABC
RDKit ERROR: [10:09:27] SMILES Parse Error: Failed parsing SMILES 'ABC' for input: 'ABC'
RDKit ERROR: [10:09:27] SMILES Parse Error: syntax error while parsing: DEF
RDKit ERROR: [10:09:27] SMILES Parse Error: Failed parsing SMILES 'DEF' for input: 'DEF'


Full vocabulary: ['pad', 'unk', 'F', '[C@@]', '-', '/', '1', '2', '[O-]', 'Cl', 'c', '\\', '4', 'N', 'S', 's', ')', 'P', '5', 'C', '[C@]', 'n', '[S+2]', '#', 'Br', '[nH]', '[N+]', '3', ' ', 'O', 'I', '[C@@H]', '[C@H]', '=', '(']
Of size: 35

Maximum length of tokenized SMILES: 51 tokens

***Inference of SMILES property done.***


In [16]:
pred_from_ens

Unnamed: 0,SMILES,ens_pred_mean,ens_pred_sd
0,CC,0.380334,0.0177944
1,CCC,0.402792,0.00697249
2,C=O,-0.0829363,0.0270584
