In [2]:
import pandas as pd
from molvs import Standardizer
from rdkit import Chem
import numpy as np
import rdkit
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray

In [3]:
# Load the ER dataset. Later cells read its 'Smiles' and 'Class' columns,
# and predefied_split() reads its 'Set' column.
er = pd.read_csv("../data/ER.csv")
def predefied_split(data):
    """Yield the single predefined train/test split encoded in ``data['Set']``.

    Rows whose ``'Set'`` value is ``'T'`` form the training fold and rows whose
    value is ``'P'`` form the test fold.

    The indices are *positional* (0-based row numbers), which is what
    scikit-learn expects from a ``cv`` iterable. Index labels would only be
    equivalent while ``data`` has a default ``RangeIndex``; once rows have been
    dropped the labels no longer match row positions.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'Set'`` column.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(train_positions, test_positions)``; exactly one tuple is produced.
    """
    in_train = (data['Set'] == 'T').to_numpy()
    in_test = (data['Set'] == 'P').to_numpy()
    yield np.flatnonzero(in_train), np.flatnonzero(in_test)

In [4]:
# Molecule preprocessing helpers, created once and reused for every row.
# molvs Standardizer; process() calls s.standardize(mol) on each parsed molecule.
s = Standardizer()
# RDKit SaltRemover; process() calls it directly on the standardized molecule.
salts = SaltRemover()

def ecfp(mol, r=3, nBits=2048, errors_as_zeros=True):
    """Return the Morgan bit-vector fingerprint of a molecule as a float32 array.

    Parameters
    ----------
    mol : str or rdkit.Chem.rdchem.Mol
        An RDKit molecule, or a SMILES string that is parsed with
        ``Chem.MolFromSmiles``.
    r : int
        Radius passed to ``GetMorganFingerprintAsBitVect``.
    nBits : int
        Number of bits passed to ``GetMorganFingerprintAsBitVect``.
    errors_as_zeros : bool
        What to return when the fingerprint cannot be computed: an all-zero
        float32 vector of length ``nBits`` if true, ``np.nan`` otherwise.

    Returns
    -------
    numpy.ndarray or float
        The fingerprint as ``float32``, or the fallback value described above.
    """
    try:
        # Parsing happens inside the try block so that an unusable input
        # (e.g. a non-string value) follows the errors_as_zeros contract too.
        if not isinstance(mol, rdkit.Chem.rdchem.Mol):
            mol = Chem.MolFromSmiles(mol)
        # Placeholder array; ConvertToNumpyArray writes the bits into it in place.
        arr = np.zeros((1,))
        ConvertToNumpyArray(GetMorganFingerprintAsBitVect(mol, r, nBits), arr)
        return arr.astype(np.float32)
    except Exception:
        # `except Exception` (not a bare except) so that interrupting the kernel
        # during a long .apply() still works.
        if errors_as_zeros:
            return np.zeros((nBits,), dtype=np.float32)
        # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
        return np.nan

def process(smiles):
    """Standardize a SMILES string and strip salts, returning a new SMILES.

    Some input records are malformed (for example the entries with a trailing
    comma that RDKit reports as parse errors). Those yield ``None`` so that the
    corresponding rows can be dropped afterwards.

    Parameters
    ----------
    smiles : str
        Input SMILES string.

    Returns
    -------
    str or None
        SMILES produced by ``Chem.MolToSmiles`` after standardization and salt
        removal, or ``None`` if any step fails.
    """
    try:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            # RDKit signals an unparseable SMILES by returning None.
            return None
        m = s.standardize(m)
        m = salts(m)
        return Chem.MolToSmiles(m)
    except Exception:
        # Best-effort cleaning: any failure marks the record as unusable.
        # `except Exception` keeps KeyboardInterrupt/SystemExit propagating.
        return None

# Clean every SMILES; records that cannot be processed become None.
er['Smiles'] = er['Smiles'].apply(process)
# Drop rows containing any missing value (including the failed SMILES above) and
# renumber the rows, so that index labels match the row positions of the feature
# matrix built later with np.vstack(er['ecfp'].values).
er = er.dropna().reset_index(drop=True)
# One fingerprint array per remaining molecule.
er['ecfp'] = er['Smiles'].apply(ecfp)

[14:49:49] SMILES Parse Error: syntax error while parsing: CCC1(c2ccccc2)C(=O)NC(=O)NC1=O,
[14:49:49] SMILES Parse Error: Failed parsing SMILES 'CCC1(c2ccccc2)C(=O)NC(=O)NC1=O,' for input: 'CCC1(c2ccccc2)C(=O)NC(=O)NC1=O,'
[14:49:49] SMILES Parse Error: syntax error while parsing: CCC1(CC)C(=O)NC(=O)NC1=O,
[14:49:49] SMILES Parse Error: Failed parsing SMILES 'CCC1(CC)C(=O)NC(=O)NC1=O,' for input: 'CCC1(CC)C(=O)NC(=O)NC1=O,'
[14:49:51] SMILES Parse Error: syntax error while parsing: Oc1cc2OC(=O)c3c(oc4cc(O)ccc43)-c2cc1,
[14:49:51] SMILES Parse Error: Failed parsing SMILES 'Oc1cc2OC(=O)c3c(oc4cc(O)ccc43)-c2cc1,' for input: 'Oc1cc2OC(=O)c3c(oc4cc(O)ccc43)-c2cc1,'
[14:49:52] SMILES Parse Error: syntax error while parsing: O=C1N(Cl)C(=O)NC(=O)N1Cl,
[14:49:52] SMILES Parse Error: Failed parsing SMILES 'O=C1N(Cl)C(=O)NC(=O)N1Cl,' for input: 'O=C1N(Cl)C(=O)NC(=O)N1Cl,'
[14:49:54] SMILES Parse Error: syntax error while parsing: CC1=NN=C(C(=O)N1N)c1ccccc1,
[14:49:54] SMILES Parse Error: Failed p

In [12]:
# Class balance check: number of rows for each value of the 'Class' label.
er['Class'].astype("category").value_counts()

0    5296
1    1979
Name: Class, dtype: int64

In [17]:
# Base XGBoost binary classifier that the grid search clones for every candidate.
estimator = XGBClassifier(
    objective='binary:logistic',
    # NOTE(review): 'gpu_hist' requires a CUDA-enabled XGBoost build. XGBoost >= 2.0
    # deprecates it in favour of tree_method='hist' with device='cuda'; kept as-is
    # here so the cell still runs on older XGBoost versions.
    tree_method='gpu_hist',
    # n_jobs / random_state are the scikit-learn-API names for the deprecated
    # `nthread` / `seed` aliases.
    n_jobs=-1,
    random_state=42,
)
# Hyper-parameter grid: 8 depths x 3 learning rates = 24 candidates.
parameters = {
    'n_estimators': [100],
    'max_depth': list(range(2, 10)),
    'learning_rate': [0.1, 0.01, 0.05],
}

In [18]:
# Exhaustive search over `parameters`, scoring each candidate by ROC AUC.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='roc_auc',
    cv=5,        # integer 5-fold CV; the predefined split from predefied_split() is not used here
    n_jobs=1,    # fits run one at a time
    verbose=2,   # log one line per completed fit
)

In [19]:
# Stack the per-row fingerprint arrays into one 2-D feature matrix and run the
# grid search against the 'Class' labels.
# NOTE(review): scikit-learn's fit() is documented to return the fitted object
# itself, so `best` presumably aliases `grid_search` rather than the winning
# model — confirm, and read best.best_estimator_ / best.best_params_ for the winner.
best = grid_search.fit(np.vstack(er['ecfp'].values),er['Class'])

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END ...learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.8s
[CV] END ...learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.8s
[CV] END ...learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.7s
[CV] END ...learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.8s
[CV] END ...learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.7s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   1.4s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   1.4s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   1.4s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   1.4s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   1.4s
[CV] END ...learning_rate=0.1, max_depth=4, n_estimators=100; total time=   2.5s
[CV] END ...learning_rate=0.1, max_depth=4, n_e

[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=100; total time=   6.3s
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=100; total time=   6.5s
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=100; total time=   6.1s
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=100; total time=   6.2s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=100; total time=   9.6s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=100; total time=   9.1s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=100; total time=   9.0s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=100; total time=   8.8s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=100; total time=   8.8s
[CV] END ..learning_rate=0.05, max_depth=8, n_estimators=100; total time=  12.4s
[CV] END ..learning_rate=0.05, max_depth=8, n_estimators=100; total time=  11.5s
[CV] END ..learning_rate=0.05, max_depth=8, n_estimators=100; total time=  12.2s
[CV] END ..learning_rate=0.0