In [1]:
from maplight import *
from tqdm import tqdm
import catboost as cb

from tdc.benchmark_group import admet_group

In [2]:
# Benchmarks handled by this notebook. Every entry maps to the same pair,
# ('binary', False), which the training loop unpacks as (task, log_scale).
# Key order is preserved and determines the order in which benchmarks are run.
benchmark_config = dict.fromkeys(
    (
        'hia_hou',
        'pgp_broccatelli',
        'bioavailability_ma',
        'bbb_martins',
        'cyp2c9_veith',
        'cyp2d6_veith',
        'cyp3a4_veith',
        'cyp2c9_substrate_carbonmangels',
        'cyp2d6_substrate_carbonmangels',
        'cyp3a4_substrate_carbonmangels',
        'herg',
        'ames',
        'dili',
    ),
    ('binary', False),
)

In [3]:
# Load the TDC ADMET benchmark group, using 'data/' as its path.
group = admet_group(path = 'data/')

# Filled per benchmark with two entries: "<name>_y_pred_proba" and "<name>_y_true".
predictions = {}
for admet_benchmark in tqdm(benchmark_config):
    benchmark = group.get(admet_benchmark)
    name = benchmark['name']
    train = benchmark['train_val']
    test = benchmark['test']

    # Featurise the 'Drug' column of each split.
    # NOTE(review): get_fingerprints is presumably supplied by the
    # `from maplight import *` star import -- confirm.
    X_train = get_fingerprints(train['Drug'])
    X_test = get_fingerprints(test['Drug'])

    # The config entry is looked up by the name the benchmark reports; the
    # unpacked values are not consulted further in this loop.
    task, log_scale = benchmark_config[name]

    # Same classifier settings for every benchmark; fixed seed for repeatability.
    params = {
        'random_strength': 2,
        'random_seed': 42,
        'verbose': 0,
        'loss_function': 'Logloss',
    }
    model = cb.CatBoostClassifier(**params)
    model.fit(X_train, train['Y'].values)

    # Keep only column 1 of the predict_proba output
    # (presumably the positive-class probability -- confirm against CatBoost docs).
    y_pred_test = model.predict_proba(X_test)[:, 1]

    predictions.update({
        f"{name}_y_pred_proba": y_pred_test,
        f"{name}_y_true": test['Y'].values,
    })

Downloading Benchmark Group...
100%|██████████████████████████████████████████████████████████████████████████| 1.47M/1.47M [00:00<00:00, 2.61MiB/s]
Extracting zip file...
Done!
100%|████████████████████████████████████████████████████████████████████████████████| 13/13 [09:23<00:00, 43.32s/it]


In [4]:
import pickle

# Write the collected predictions dict to 'maplight_binary.pickle' using the
# highest pickle protocol available to this interpreter.
with open('maplight_binary.pickle', mode='wb') as out_file:
    pickle.dump(predictions, out_file, pickle.HIGHEST_PROTOCOL)