In [1]:
import rdkit
import pandas as pd
from rdkit import Chem
import numpy as np
from tune_sklearn import TuneSearchCV
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
import molvs
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from rdkit.Chem.SaltRemover import SaltRemover

def ecfp(mol, r=3, nBits=2048, errors_as_zeros=True):
    """Return the Morgan bit-vector fingerprint of a molecule as a float32 array.

    Parameters
    ----------
    mol : str or rdkit.Chem.rdchem.Mol
        Either an RDKit molecule or a SMILES string, which is parsed with
        ``Chem.MolFromSmiles``.
    r : int
        Radius passed to ``GetMorganFingerprintAsBitVect``.
    nBits : int
        Number of bits requested from ``GetMorganFingerprintAsBitVect``.
    errors_as_zeros : bool
        Selects the value returned when the fingerprint cannot be computed:
        an all-zero float32 vector of length ``nBits`` when True, ``np.nan``
        when False.

    Returns
    -------
    numpy.ndarray or float
        The fingerprint as float32, or the fallback value described above.
    """
    try:
        # Parsing happens inside the try so that malformed or non-string input
        # follows the same fallback path as any other fingerprinting failure.
        if not isinstance(mol, rdkit.Chem.rdchem.Mol):
            mol = Chem.MolFromSmiles(mol)
        if mol is None:
            raise ValueError("SMILES could not be parsed into a molecule")
        # The destination array is filled by ConvertToNumpyArray.
        arr = np.zeros((1,))
        ConvertToNumpyArray(GetMorganFingerprintAsBitVect(mol, r, nBits), arr)
        return arr.astype(np.float32)
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long `.apply` can still be interrupted.
        if errors_as_zeros:
            return np.zeros((nBits,), dtype=np.float32)
        return np.nan


In [2]:
# Load both datasets, keeping only the structure and label columns and
# renaming them to a common schema: `smiles` (input) and `y` (target).
ames = (
    pd.read_csv("../data/Mutagenicity_N6512.csv")[['Canonical_Smiles', 'Activity']]
    .rename(columns={'Canonical_Smiles': 'smiles', "Activity": "y"})
)
er = (
    pd.read_csv("../data/ER.csv")[['Smiles', 'Class']]
    .rename(columns={'Smiles': 'smiles', "Class": "y"})
)

# Shared preprocessing objects used by `process` below.
s = molvs.Standardizer()
salts = SaltRemover()

def process(smiles):
    """Turn a SMILES string into a 2048-bit, radius-3 Morgan fingerprint.

    The molecule is parsed, standardized with the module-level molvs
    ``Standardizer`` (``s``) and passed through the module-level
    ``SaltRemover`` (``salts``) before fingerprinting.

    Some input records are broken and cannot be processed; for those the
    function returns None so the caller can drop the rows afterwards.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    numpy.ndarray or None
        float32 fingerprint array, or None if any processing step fails.
    """
    try:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            # Unparseable SMILES: bail out explicitly instead of relying on
            # a downstream call to raise on a None molecule.
            return None
        m = s.standardize(m)
        m = salts(m)
        # The destination array is filled by ConvertToNumpyArray.
        arr = np.zeros((1,))
        ConvertToNumpyArray(GetMorganFingerprintAsBitVect(m, 3, 2048), arr)
        return arr.astype(np.float32)
    except Exception:
        # Best-effort by design: any failure marks the record as unusable.
        # `except Exception` still lets KeyboardInterrupt/SystemExit through.
        return None



In [3]:
# Compute a fingerprint for every SMILES. `process` returns None for records
# it cannot handle, and dropna() then removes every row that has a missing
# value in any column.
er = er.assign(X=er['smiles'].apply(process)).dropna()
ames = ames.assign(X=ames['smiles'].apply(process)).dropna()


[16:59:40] SMILES Parse Error: syntax error while parsing: CCC1(c2ccccc2)C(=O)NC(=O)NC1=O,
[16:59:40] SMILES Parse Error: Failed parsing SMILES 'CCC1(c2ccccc2)C(=O)NC(=O)NC1=O,' for input: 'CCC1(c2ccccc2)C(=O)NC(=O)NC1=O,'
[16:59:40] SMILES Parse Error: syntax error while parsing: CCC1(CC)C(=O)NC(=O)NC1=O,
[16:59:40] SMILES Parse Error: Failed parsing SMILES 'CCC1(CC)C(=O)NC(=O)NC1=O,' for input: 'CCC1(CC)C(=O)NC(=O)NC1=O,'
[16:59:42] SMILES Parse Error: syntax error while parsing: Oc1cc2OC(=O)c3c(oc4cc(O)ccc43)-c2cc1,
[16:59:42] SMILES Parse Error: Failed parsing SMILES 'Oc1cc2OC(=O)c3c(oc4cc(O)ccc43)-c2cc1,' for input: 'Oc1cc2OC(=O)c3c(oc4cc(O)ccc43)-c2cc1,'
[16:59:44] SMILES Parse Error: syntax error while parsing: O=C1N(Cl)C(=O)NC(=O)N1Cl,
[16:59:44] SMILES Parse Error: Failed parsing SMILES 'O=C1N(Cl)C(=O)NC(=O)N1Cl,' for input: 'O=C1N(Cl)C(=O)NC(=O)N1Cl,'
[16:59:46] SMILES Parse Error: syntax error while parsing: CC1=NN=C(C(=O)N1N)c1ccccc1,
[16:59:46] SMILES Parse Error: Failed p

In [None]:
def gridsearch(data, n_trials=500, cv=5, n_jobs=6, use_gpu=True):
    """Run a Bayesian hyper-parameter search for an XGBoost binary classifier.

    Despite the name, this uses ``TuneSearchCV`` with
    ``search_optimization='bayesian'`` rather than an exhaustive grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain column ``X`` (one fingerprint array per row) and column
        ``y`` (the class label per row).
    n_trials : int
        Number of configurations sampled by the search.
    cv : int
        Number of cross-validation folds.
    n_jobs : int
        Parallelism passed to ``TuneSearchCV``.
    use_gpu : bool
        When True (the default, matching the previous hard-coded behaviour)
        the search requests GPUs and XGBoost uses ``gpu_hist``; when False
        the CPU ``hist`` tree method is used instead.

    Returns
    -------
    The fitted ``TuneSearchCV`` object (``fit`` returns the estimator).

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("gridsearch() received an empty dataset")

    estimator = XGBClassifier(
        objective='binary:logistic',
        tree_method='gpu_hist' if use_gpu else 'hist',
        seed=42
    )
    # Discrete candidate values for each tuned hyper-parameter.
    parameters = {
        'n_estimators': [100],
        'max_depth': list(range(2, 10, 1)),
        'learning_rate': [0.1, 0.01, 0.05],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
    }
    grid_search = TuneSearchCV(
        estimator=estimator,
        param_distributions=parameters,
        n_trials=n_trials,
        early_stopping=True,
        scoring='roc_auc',
        use_gpu=use_gpu,
        n_jobs=n_jobs,
        search_optimization='bayesian',
        cv=cv,
        verbose=3
    )

    # Stack the per-row fingerprint arrays into one 2-D feature matrix.
    features = np.vstack(data['X'].values)
    # Labels are passed as a 1-D vector; the previous np.vstack call turned
    # them into an (n, 1) column, which is not the shape scikit-learn expects.
    labels = data['y'].to_numpy()
    return grid_search.fit(features, labels)



In [None]:
# NOTE(review): the Ames search is disabled here, yet a later cell reads
# `ames_grid.best_params_`. Unless `ames_grid` is defined somewhere not shown,
# that cell raises NameError on a fresh kernel; re-enable this line to restore it.
#ames_grid = gridsearch(ames)
# Run the hyper-parameter search on the ER dataset.
er_grid = gridsearch(er)



Trial name,status,loc,colsample_bytree,gamma,learning_rate,max_depth,min_child_weight,n_estimators,subsample,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_71ad8770,RUNNING,131.130.129.247:868507,0.8,1.0,0.05,9,5,100,0.6,,,,,
_Trainable_8363344c,RUNNING,131.130.129.247:868503,0.8,1.0,0.05,9,5,100,0.6,,,,,
_Trainable_85e69420,RUNNING,131.130.129.247:868509,0.8,1.0,0.05,9,5,100,0.6,,,,,
_Trainable_955ea564,RUNNING,131.130.129.247:868505,0.8,1.0,0.1,8,5,100,0.6,,,,,
_Trainable_a959d3ea,RUNNING,131.130.129.247:868475,1.0,1.0,0.05,7,5,100,0.6,,,,,
_Trainable_b0a8e550,PENDING,,1.0,1.0,0.05,7,5,100,0.6,,,,,
_Trainable_03e0252e,TERMINATED,131.130.129.247:868505,0.8,1.5,0.05,9,1,100,0.6,1.0,185.107,0.690674,0.687518,0.730188
_Trainable_04d52ff0,TERMINATED,131.130.129.247:868505,1.0,1.5,0.05,9,5,100,0.8,1.0,154.949,0.692321,0.694598,0.726721
_Trainable_086218ea,TERMINATED,131.130.129.247:868503,1.0,1.5,0.05,9,5,100,0.6,1.0,144.543,0.690679,0.694409,0.727335
_Trainable_0c9b60e4,TERMINATED,131.130.129.247:868509,0.8,1.5,0.01,6,1,100,0.8,1.0,145.16,0.659714,0.669275,0.699109


Result for _Trainable_7fcc542a:
  average_test_score: 0.681839684569535
  date: 2022-12-06_10-01-30
  done: true
  experiment_id: b4c888095e9841ceae107d68097526ab
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 0.681839684569535
  pid: 868503
  split0_test_score: 0.6600995793630354
  split1_test_score: 0.6704855447773295
  split2_test_score: 0.68959424271039
  split3_test_score: 0.7089175513396477
  split4_test_score: 0.6801015046572726
  time_since_restore: 30.25159788131714
  time_this_iter_s: 30.25159788131714
  time_total_s: 30.25159788131714
  timestamp: 1670317290
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 7fcc542a
  warmup_time: 0.023080825805664062
  
Result for _Trainable_7fcf8b90:
  average_test_score: 0.6949103147236958
  date: 2022-12-06_10-01-36
  done: true
  experiment_id: 669c69307b2c41e7be55d2174981d5ad
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 0.69

Result for _Trainable_a999dc8c:
  average_test_score: 0.7001643034044113
  date: 2022-12-06_10-04-13
  done: true
  experiment_id: b4c888095e9841ceae107d68097526ab
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 0.7001643034044113
  pid: 868503
  split0_test_score: 0.6819552942074187
  split1_test_score: 0.6788601310555985
  split2_test_score: 0.717190793678046
  split3_test_score: 0.7300221287473413
  split4_test_score: 0.6927931693336518
  time_since_restore: 99.7350549697876
  time_this_iter_s: 99.7350549697876
  time_total_s: 99.7350549697876
  timestamp: 1670317453
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: a999dc8c
  warmup_time: 0.023080825805664062
  
Result for _Trainable_be34d5a2:
  average_test_score: 0.6967668330024571
  date: 2022-12-06_10-04-17
  done: true
  experiment_id: 0a74f61f145b4db1a43d3f7b615e26a7
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 0.69

Result for _Trainable_7e3abaa6:
  average_test_score: 0.6851349447841868
  date: 2022-12-06_10-08-51
  done: true
  experiment_id: 669c69307b2c41e7be55d2174981d5ad
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 0.6851349447841868
  pid: 868507
  split0_test_score: 0.6672401541381712
  split1_test_score: 0.6724325407044953
  split2_test_score: 0.6955091519539112
  split3_test_score: 0.7223378735418395
  split4_test_score: 0.6681550035825172
  time_since_restore: 36.81596922874451
  time_this_iter_s: 36.81596922874451
  time_total_s: 36.81596922874451
  timestamp: 1670317731
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 7e3abaa6
  warmup_time: 0.020880699157714844
  
Result for _Trainable_83caaeb8:
  average_test_score: 0.6851349447841868
  date: 2022-12-06_10-08-53
  done: true
  experiment_id: 0a74f61f145b4db1a43d3f7b615e26a7
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 

Result for _Trainable_bcac0ea2:
  average_test_score: 0.7074767038964485
  date: 2022-12-06_10-11-48
  done: true
  experiment_id: 6f05d79c8be644a7aea8ef8d7adb62c1
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 0.7074767038964485
  pid: 868509
  split0_test_score: 0.6888609894983833
  split1_test_score: 0.6967956238494483
  split2_test_score: 0.7267075857727415
  split3_test_score: 0.723509886399405
  split4_test_score: 0.7015094339622642
  time_since_restore: 89.2468056678772
  time_this_iter_s: 89.2468056678772
  time_total_s: 89.2468056678772
  timestamp: 1670317908
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: bcac0ea2
  warmup_time: 0.019457340240478516
  
Result for _Trainable_ce599aac:
  average_test_score: 0.7074767038964485
  date: 2022-12-06_10-12-24
  done: true
  experiment_id: da0c4cd6f0f549c2bef33ff37f34a425
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 0.70

Result for _Trainable_72a75f90:
  average_test_score: 0.7008889113073147
  date: 2022-12-06_10-17-36
  done: true
  experiment_id: da0c4cd6f0f549c2bef33ff37f34a425
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 0.7008889113073147
  pid: 868505
  split0_test_score: 0.6805722474985931
  split1_test_score: 0.6886773781249702
  split2_test_score: 0.7231271639911866
  split3_test_score: 0.7247379364943105
  split4_test_score: 0.6873298304275137
  time_since_restore: 126.24018406867981
  time_this_iter_s: 126.24018406867981
  time_total_s: 126.24018406867981
  timestamp: 1670318256
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 72a75f90
  warmup_time: 0.02291250228881836
  
Result for _Trainable_87ef9250:
  average_test_score: 0.7008889113073147
  date: 2022-12-06_10-17-45
  done: true
  experiment_id: b4c888095e9841ceae107d68097526ab
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective

Result for _Trainable_0e82e920:
  average_test_score: 0.7055703104859958
  date: 2022-12-06_10-21-58
  done: true
  experiment_id: 0a74f61f145b4db1a43d3f7b615e26a7
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 0.7055703104859958
  pid: 868475
  split0_test_score: 0.6923209908337388
  split1_test_score: 0.694598248776719
  split2_test_score: 0.7267207008708425
  split3_test_score: 0.725298308867714
  split4_test_score: 0.6889133030809649
  time_since_restore: 154.7320671081543
  time_this_iter_s: 154.7320671081543
  time_total_s: 154.7320671081543
  timestamp: 1670318518
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 0e82e920
  warmup_time: 0.018968820571899414
  
Result for _Trainable_129a09a8:
  average_test_score: 0.7086151508119133
  date: 2022-12-06_10-22-52
  done: true
  experiment_id: 669c69307b2c41e7be55d2174981d5ad
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 0.

Result for _Trainable_a851a898:
  average_test_score: 0.6986429809310546
  date: 2022-12-06_10-28-06
  done: true
  experiment_id: 669c69307b2c41e7be55d2174981d5ad
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objective: 0.6986429809310546
  pid: 868507
  split0_test_score: 0.6905325683654295
  split1_test_score: 0.6943037552102708
  split2_test_score: 0.7182924619185242
  split3_test_score: 0.7069908242004559
  split4_test_score: 0.6830952949605923
  time_since_restore: 223.10791063308716
  time_this_iter_s: 223.10791063308716
  time_total_s: 223.10791063308716
  timestamp: 1670318886
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: a851a898
  warmup_time: 0.020880699157714844
  
Result for _Trainable_c54c87f6:
  average_test_score: 0.6986429809310546
  date: 2022-12-06_10-28-18
  done: true
  experiment_id: 6f05d79c8be644a7aea8ef8d7adb62c1
  hostname: A771-PHI-247
  iterations_since_restore: 1
  node_ip: 131.130.129.247
  objectiv

In [26]:
import joblib
# Show the best hyper-parameters found for the Ames dataset.
# NOTE(review): `ames_grid` is only assigned by a line that is commented out in
# the search cell above, so this depends on leftover kernel state and fails
# with NameError after Restart & Run All unless that line is re-enabled.
ames_grid.best_params_
# Optional: persist the fitted search object to disk.
#joblib.dump(ames_grid,'ames_bayesian.bin')



{'n_estimators': 100,
 'max_depth': 8,
 'learning_rate': 0.1,
 'min_child_weight': 1,
 'gamma': 0.5,
 'subsample': 0.8,
 'colsample_bytree': 1.0}