In [1]:
!pip install --upgrade pip
!pip install rdkit-pypi
!pip install PyTDC -q
!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

In [4]:
# Import base libraries

import time, os
import numpy as np
import pandas as pd
import itertools
from tqdm import tqdm

from sklearn.metrics import median_absolute_error as MAE
  
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
#from rdkit.Chem import Descriptors, Descriptors3D
#from rdkit.ML.Descriptors import MoleculeDescriptors

# H2O client and its AutoML entry point, plus the stdlib warnings module.
import h2o
from h2o.automl import H2OAutoML
import warnings

# Record which h2o client version produced the results below.
print(h2o.__version__)

# Bring up the H2O backend that the AutoML runs will use.
h2o.init()

# From this point on every Python warning is suppressed.
warnings.filterwarnings(action="ignore")
print('Python libs loaded and autoML server initialized ......')

In [5]:
# Enumerate the benchmarks (competitions) that make up the TDC ADMET group.
from tdc import utils

admet_group_key = 'ADMET_Group'
names = utils.retrieve_benchmark_names(admet_group_key)
names

In [6]:
# We load the ld50 benchmark
from tdc.benchmark_group import admet_group

# The group object is rooted at the relative `data/` directory.
group = admet_group(path='data/')

# Fetch the single benchmark used throughout the rest of the notebook.
benchmark = group.get('ld50_zhu')

# Only the last expression of a cell is rendered, so the former bare `group`
# line in the middle of the cell did nothing and has been removed.
benchmark

In [7]:
# The benchmark dict carries two frames: 'train_val' and 'test'.
# Report the (rows, columns) shape of each.
for label, split in (('train_val ', 'train_val'), ('test ', 'test')):
    print(label, benchmark[split].shape)

In [8]:
# Peek at the first row of the train/validation frame to see its columns.
benchmark['train_val'].iloc[:1]

# MACCSkeys

In [9]:
# Build one working frame per split. Each row holds the RDKit molecule object
# parsed from a SMILES string ('mol') next to the SMILES string itself ('smiles').
train_smiles = benchmark['train_val'].Drug
test_smiles = benchmark['test'].Drug

# Parse train first, then test (same order as before).
train_mols = train_smiles.apply(Chem.MolFromSmiles)
test_mols = test_smiles.apply(Chem.MolFromSmiles)

# The Series supplies the index; the raw SMILES values are attached by position.
dfTrain = pd.DataFrame({'mol': train_mols, 'smiles': train_smiles.values})
dfTest = pd.DataFrame({'mol': test_mols, 'smiles': test_smiles.values})

for frame in (dfTrain, dfTest):
    print(frame.shape)

In [10]:
from rdkit.Chem import MACCSkeys
# Column labels for the 167 fingerprint positions: MACCSbit0 ... MACCSbit166.
header = [f'MACCSbit{position}' for position in range(167)]
len(header)

In [11]:
# MACCS-key fingerprint of every training molecule: one row per molecule,
# one column per bit (each bit kept as the character '0' or '1', as before).
#
# * The molecules are iterated by value instead of `dfTrain['mol'][rowNum]`,
#   which is a label lookup and only works while the index is exactly 0..n-1.
# * The result reuses dfTrain's index so that a column-wise concat with
#   dfTrain lines up row for row whatever that index is.
maccs = pd.DataFrame(
    [list(MACCSkeys.GenMACCSKeys(mol).ToBitString()) for mol in tqdm(dfTrain['mol'])],
    columns=header,
    index=dfTrain.index,
)
print(maccs.shape)
# Show a preview rather than the whole frame.
maccs.head()

In [12]:
# Attach the fingerprint columns to the training frame.
# Any fingerprint columns already present are dropped first, so re-running
# this cell replaces them instead of appending a duplicate set of columns.
print(dfTrain.shape)
dfTrain = pd.concat([dfTrain.drop(columns=header, errors='ignore'), maccs], axis=1)
print(dfTrain.shape)

In [13]:
# MACCS-key fingerprint of every test molecule: one row per molecule,
# one column per bit (each bit kept as the character '0' or '1', as before).
#
# * The molecules are iterated by value instead of `dfTest['mol'][rowNum]`,
#   which is a label lookup and only works while the index is exactly 0..n-1.
# * The result reuses dfTest's index so that a column-wise concat with
#   dfTest lines up row for row whatever that index is.
maccs = pd.DataFrame(
    [list(MACCSkeys.GenMACCSKeys(mol).ToBitString()) for mol in tqdm(dfTest['mol'])],
    columns=header,
    index=dfTest.index,
)
print(maccs.shape)
# Show a preview rather than the whole frame.
maccs.head()

In [14]:
# Attach the fingerprint columns to the test frame.
# Any fingerprint columns already present are dropped first, so re-running
# this cell replaces them instead of appending a duplicate set of columns.
print(dfTest.shape)
dfTest = pd.concat([dfTest.drop(columns=header, errors='ignore'), maccs], axis=1)
print(dfTest.shape)

In [15]:
# DROP constant columns
#
# A feature that takes a single value across the training data carries no
# signal. The list of columns to drop is derived from the TRAINING split only
# and then applied to both frames, so the held-out test split plays no part
# in feature selection. Test-constant columns are still counted, but only for
# information.
#
# 'mol' and 'smiles' are bookkeeping columns removed explicitly by a later
# cell; they are never candidates here, so that later drop cannot fail.

bookkeepingCols = ('mol', 'smiles')

trainFeatureCols = [col for col in dfTrain.columns if col not in bookkeepingCols]
trainUnique = dfTrain[trainFeatureCols].nunique()
constColsTrain = trainUnique[trainUnique <= 1].index.tolist()
print('Constant columns Train', len(constColsTrain))

testFeatureCols = [col for col in dfTest.columns if col not in bookkeepingCols]
testUnique = dfTest[testFeatureCols].nunique()
constColsTest = testUnique[testUnique <= 1].index.tolist()
print('Constant columns Test', len(constColsTest))

# Only the train-derived list is actually dropped.
constCols = list(constColsTrain)
print('Constant columns dropped (train only)', len(constCols))

print('With constant cols ', dfTrain.shape)
dfTrain = dfTrain.drop(columns=constCols)
print('Without ', dfTrain.shape)

# Same columns removed from test so both frames keep an identical feature set.
print('With constant cols ', dfTest.shape)
dfTest = dfTest.drop(columns=constCols)
print('Without ', dfTest.shape)

In [16]:
# Strip the bookkeeping columns so only the fingerprint bits remain as features.
# `errors='ignore'` keeps the cell re-runnable: once the columns are gone a
# second execution is a no-op instead of a KeyError.
dfTrain = dfTrain.drop(columns=['mol', 'smiles'], errors='ignore')
print(dfTrain.shape)

dfTest = dfTest.drop(columns=['mol', 'smiles'], errors='ignore')
# Printed explicitly: a bare expression in the middle of a cell is not shown.
print(dfTest.shape)

# Make sure every column label is a plain string.
dfTrain.columns = dfTrain.columns.map(str)
dfTest.columns = dfTest.columns.map(str)
dfTest.columns

In [17]:
%%time

#Prep the h2o autoML frames

train = pd.DataFrame()
train = pd.concat([train,dfTrain],axis=0)
train['LD50'] = benchmark['train_val'].Y.values

test = dfTest

print(train.shape)
print(test.shape)

#Prep the h2o frames
trainH2 = h2o.H2OFrame(train)
testH2 = h2o.H2OFrame(test)

x = testH2.columns
y = 'LD50'

In [18]:
# Run AutoML - capped by a number of models and by an allotted run time
aml = H2OAutoML(seed=47, max_models=25, max_runtime_secs=900)
aml.train(training_frame=trainH2, x=x, y=y)

# Predict on test with the leader model
Preds = aml.leader.predict(testH2)
y_pred_test = Preds.as_data_frame().values.flatten()
y_test = benchmark['test'].Y.values
print(len(y_pred_test))

# Mean absolute error computed by hand, as a cross-check of the TDC number
abs_errors = np.abs(y_test - y_pred_test)
print('my MAE =', round(abs_errors.mean(), 3))

# TDC evaluator .......................
name = benchmark['name']
predictions = {name: y_pred_test}
print('TDC evaluator =', group.evaluate(predictions))

In [None]:
# TDC competition requires the model be run 5 times so the std deviation can be calculated

name = benchmark['name']
predictions_list = []

for seed in range(1, 6):
    ## --- train your model --- ##
    aml = H2OAutoML(seed=seed, max_models=25, max_runtime_secs=900)
    aml.train(training_frame=trainH2, x=x, y=y)

    # Predict on test with this run's leader model
    Preds = aml.leader.predict(testH2)
    y_pred_test = Preds.as_data_frame().values.flatten()

    # One {benchmark name: predictions} dict per seed
    predictions = {name: y_pred_test}
    print('TDC evaluator =', group.evaluate(predictions))
    predictions_list.append(predictions)

In [None]:
# Combine the per-seed prediction dicts into the final benchmark summary.
FinalResults = group.evaluate_many(predictions_list)
print(
    'FinalResults TDC evaluator on 5 runs = ',
    FinalResults,
)