In [1]:
#### !/usr/bin/env python
# coding: utf-8
from molmap.model import RegressionEstimator, MultiClassEstimator, MultiLabelEstimator
from molmap import loadmap
from molmap.show import imshow_wrap
import molmap
from molmap import MolMap

from sklearn.utils import shuffle 
from joblib import load, dump
import numpy as np
import pandas as pd
import os

from chembench import dataset

def get_pos_weights(trainY):
    """Compute per-column class-balance ratios for a 0/1 label matrix.

    Parameters
    ----------
    trainY : array-like
        Label data. It is wrapped in a ``pd.DataFrame`` and every column is
        tallied independently. Entries equal to 1 count as positives and
        entries equal to 0 as negatives; any other value (e.g. NaN) is
        counted by neither tally.

    Returns
    -------
    tuple of np.ndarray
        ``(pos_weights, neg_weights)`` with one entry per column, where
        ``pos_weights = neg_n / pos_n`` and ``neg_weights = pos_n / neg_n``.
    """
    labels = pd.DataFrame(trainY)
    # Column-wise counts of each class.
    n_pos = labels.eq(1).sum(axis=0)
    n_neg = labels.eq(0).sum(axis=0)
    return (n_neg / n_pos).values, (n_pos / n_neg).values

### optimized hyper-parameters

In [2]:
### optimized hyper-parameters

# -- settings forwarded to mp.fit(method='umap', ...)
min_dist = 0.75
n_neighbors = 15

# -- not referenced by the cells shown here; presumably names the feature
#    family of the saved map that gets loaded -- TODO confirm
input_feature_maps = 'fingerprint'

# -- settings forwarded to MultiLabelEstimator
lr = 1e-4
batch_size = 32
dense_layers = [128, 32]

In [3]:
## random
# Load the BACE benchmark object and unpack the attributes used by later cells.
data = dataset.load_BACE()
task_name, task_type = data.task_name, data.task_type

Y, df = data.y, data.df
n_outputs = Y.shape[1]  # size of the second axis of the label array

# which gpu to use
gpuid = 5

# One entry per pre-computed split; each value is also the name of the
# sub-directory the split CSVs are read from.
random_seeds = [
    2, 16, 32, 64, 128,
    256, 512, 1024, 2048, 4096,
]

total samples: 1513


In [4]:
# Load the saved map object, then fit it with the 'umap' method using the
# tuned neighbourhood settings defined earlier.
mp = loadmap('../../fingerprint.mp')
mp.fit(
    method='umap',
    n_neighbors=n_neighbors,
    min_dist=min_dist,
)

UMAP(metric='precomputed', min_dist=0.75, random_state=1, verbose=2)
Construct fuzzy simplicial set
Thu Sep 24 13:14:15 2020 Finding Nearest Neighbors
Thu Sep 24 13:14:15 2020 Finished Nearest Neighbor Search
Thu Sep 24 13:14:17 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Sep 24 13:14:21 2020 Finished embedding
2020-09-24 13:14:21,167 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-09-24 13:14:23,244 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


In [5]:
# Transform every SMILES string in `df` (in row order) with the fitted map.
X = mp.batch_transform(
    df.smiles.tolist(),
    scale=False,
    n_jobs=16,
)

100%|##########| 1513/1513 [00:34<00:00, 44.42it/s]


In [6]:
# Shape of X with its first axis dropped; passed to the estimator as the
# feature-map shape. The bare name on the last line displays it.
fmap_shape1= X.shape[1:]
fmap_shape1

(37, 36, 3)

In [1]:
# Train and evaluate one MultiLabelEstimator per pre-computed split.
#
# Split files are read from <file_path>/<task_name>/<seed>/{train,val,test}.csv
# and matched back to rows of `df` by SMILES string.
# NOTE(review): `file_path` is a machine-specific absolute path -- adjust it
# before running this notebook elsewhere.
file_path = "/raid/shenwanxiang/08_Robustness/dataset_induces/split" #split

result_file = 'OPT_%s.csv' % task_name

# Start a fresh results file; one row is appended per seed inside the loop,
# so results of finished seeds are kept if a later seed fails.
with open(result_file, 'w+') as f:
    f.write('task_name, seed, valid_auc, test_auc\n')

# the dense layers for these multi outputs tasks

res = []  # one summary dict per seed
for seed in random_seeds:

    seed_dir = os.path.join(file_path, task_name, "%s" % seed)
    train_path = os.path.join(seed_dir, "train.csv")
    valid_path = os.path.join(seed_dir, "val.csv")
    test_path = os.path.join(seed_dir, "test.csv")

    train_df = pd.read_csv(train_path)
    valid_df = pd.read_csv(valid_path)
    test_df = pd.read_csv(test_path)

    # X and Y are NumPy arrays indexed by *position*, so collect row positions
    # of the matching SMILES rather than `df` index labels. The two coincide
    # only when `df` has a default 0..n-1 index; positions are always correct.
    train_idx = np.flatnonzero(df.smiles.isin(train_df.smiles).values).tolist()
    valid_idx = np.flatnonzero(df.smiles.isin(valid_df.smiles).values).tolist()
    test_idx = np.flatnonzero(df.smiles.isin(test_df.smiles).values).tolist()

    print(len(train_idx), len(valid_idx), len(test_idx))

    X_train = X[train_idx]
    y_train = Y[train_idx]

    X_valid = X[valid_idx]
    y_valid = Y[valid_idx]

    X_test = X[test_idx]
    y_test = Y[test_idx]

    clf = MultiLabelEstimator(n_outputs,
                              fmap_shape1,lr = lr,
                              batch_size = batch_size,
                              dense_layers = dense_layers,
                              gpuid = gpuid,
                              monitor = 'val_auc',
                             )
    clf.fit(X_train,y_train, X_valid, y_valid)

    # NOTE(review): `_performance` is a private attribute of the estimator;
    # its `evaluate` is assumed to return per-output AUC values -- confirm.
    train_aucs = clf._performance.evaluate(X_train,y_train)
    valid_aucs = clf._performance.evaluate(X_valid,y_valid)
    test_aucs = clf._performance.evaluate(X_test,y_test)

    # Average over outputs, skipping any NaN entries.
    train_auc = np.nanmean(train_aucs)
    valid_auc = np.nanmean(valid_aucs)
    test_auc = np.nanmean(test_aucs)

    final_res = {'seed': seed,
                 "task_name": task_name,
                 'train_auc':train_auc,
                 'valid_auc':valid_auc,
                 'test_auc':test_auc,}

    print(final_res)

    # Only the validation and test scores are persisted; the train score is
    # kept in `res` alone.
    with open(result_file, 'a+') as f:
        f.write('%s, %s, %s, %s\n' % (task_name, seed, valid_auc, test_auc))

    res.append(final_res)