In [1]:
#### !/usr/bin/env python
# coding: utf-8
from molmap.model import RegressionEstimator, MultiClassEstimator, MultiLabelEstimator
from molmap import loadmap
from molmap.show import imshow_wrap
import molmap
from molmap import MolMap

from sklearn.utils import shuffle 
from joblib import load, dump
import numpy as np
import pandas as pd
import os

from chembench import dataset

def get_pos_weights(trainY):
    """Compute per-column class-balance weights from a label matrix.

    Every column of ``trainY`` is counted independently. Entries equal to 1
    are positives and entries equal to 0 are negatives; any other value
    (for example a masking sentinel or NaN) falls into neither count.

    Parameters
    ----------
    trainY : array-like
        Anything ``pd.DataFrame`` accepts; one column per output.

    Returns
    -------
    tuple of numpy.ndarray
        ``(pos_weights, neg_weights)`` where ``pos_weights`` is
        ``neg_n / pos_n`` and ``neg_weights`` is ``pos_n / neg_n``,
        one value per column.
    """
    labels = pd.DataFrame(trainY)
    n_pos = labels.eq(1).sum(axis=0)
    n_neg = labels.eq(0).sum(axis=0)
    return n_neg.div(n_pos).values, n_pos.div(n_neg).values

### optimized hyper-parameters

In [2]:
### optimized hyper-parameters
n_neighbors = 15  # n_neighbors handed to the UMAP fit of both feature maps
input_feature_maps = 'both'  # NOTE(review): not referenced in the visible cells - confirm it is still needed
batch_size = 64  # batch_size handed to MultiLabelEstimator
lr = 1e-4  # lr handed to MultiLabelEstimator
dense_layers = [128]  # dense_layers handed to MultiLabelEstimator

In [3]:
## random
# Load the Tox21 dataset object and pull out the pieces used by later cells.
data = dataset.load_Tox21()
task_name, task_type = data.task_name, data.task_type
df = data.df

# Label matrix with missing entries replaced by -1; the same value is given
# to the loss as MASK in the training cell.
Y = pd.DataFrame(data.y).fillna(-1).values
n_outputs = Y.shape[1]  # one output per label column

gpuid = 4 # which gpu to use

# One pre-computed train/val/test split directory is read per seed.
random_seeds = [2, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]

total samples: 7831


In [4]:
# Both saved maps are re-fitted with identical UMAP settings.
umap_fit_kwargs = dict(method='umap', n_neighbors=n_neighbors)

# Descriptor-based map.
mp1 = loadmap('../../descriptor.mp')
mp1.fit(**umap_fit_kwargs)

# Fingerprint-based map.
mp2 = loadmap('../../fingerprint.mp')
mp2.fit(**umap_fit_kwargs)

UMAP(metric='precomputed', random_state=1, verbose=2)
Construct fuzzy simplicial set
Fri Sep 25 00:17:02 2020 Finding Nearest Neighbors
Fri Sep 25 00:17:02 2020 Finished Nearest Neighbor Search
Fri Sep 25 00:17:04 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Fri Sep 25 00:17:09 2020 Finished embedding
2020-09-25 00:17:09,708 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-09-25 00:17:13,731 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m
UMAP(metric='precomputed', random_state=1, verbose=2)
Construct fuzzy simplicial set
Fri Sep 25 00:17:13 2020 Finding Nearest Neighbors
Fri Sep 25 00:17:13 2020 Finished Nearest Neighbor Sea

In [5]:
# Transform every SMILES string in df with each fitted map; the rows of X1
# and X2 therefore follow the row order of df.
smiles_list = df.smiles.tolist()
X1 = mp1.batch_transform(smiles_list, n_jobs=16)
X2 = mp2.batch_transform(smiles_list, n_jobs=16)

100%|##########| 7831/7831 [10:12<00:00, 12.78it/s]
100%|##########| 7831/7831 [02:38<00:00, 49.34it/s]


In [6]:
# Per-sample shape of each feature-map array (leading sample axis dropped);
# these are passed to MultiLabelEstimator in the training cell.
fmap_shape1, fmap_shape2 = X1.shape[1:], X2.shape[1:]

In [8]:
# Root directory of the pre-computed splits, laid out as
# <file_path>/<task_name>/<seed>/{train,val,test}.csv.
# NOTE(review): machine-specific absolute path - adjust before running on
# another machine.
file_path = "/raid/shenwanxiang/08_Robustness/dataset_induces/split" #split

result_file = 'OPT_%s.csv' % task_name

# Start a fresh results file; one row per seed is appended inside the loop.
# NOTE(review): header and rows carry a space after every comma, so readers
# need e.g. pd.read_csv(..., skipinitialspace=True). The format is left
# unchanged so existing consumers of this file keep working.
with open(result_file, 'w') as f:
    f.write('task_name, seed, valid_auc, test_auc\n')

# the dense layers for these multi outputs tasks

res = []
for seed in random_seeds:

    # Locate and read the three split files for this seed.
    split_dir = os.path.join(file_path, task_name, str(seed))
    train_path = os.path.join(split_dir, "train.csv")
    valid_path = os.path.join(split_dir, "val.csv")
    test_path = os.path.join(split_dir, "test.csv")

    train_df = pd.read_csv(train_path)
    valid_df = pd.read_csv(valid_path)
    test_df = pd.read_csv(test_path)

    # Map each split back onto rows of df by SMILES membership.
    # X1, X2 and Y are NumPy arrays ordered like the rows of df, so they must
    # be indexed by row *position*. np.flatnonzero on the boolean mask gives
    # positions regardless of what df's index labels are (the previous
    # df[mask].index only matched positions for a default RangeIndex).
    train_idx = np.flatnonzero(df.smiles.isin(train_df.smiles).values)
    valid_idx = np.flatnonzero(df.smiles.isin(valid_df.smiles).values)
    test_idx = np.flatnonzero(df.smiles.isin(test_df.smiles).values)

    print(len(train_idx), len(valid_idx), len(test_idx))

    X_train = (X1[train_idx], X2[train_idx])
    y_train = Y[train_idx]

    X_valid = (X1[valid_idx], X2[valid_idx])
    y_valid = Y[valid_idx]

    X_test = (X1[test_idx], X2[test_idx])
    y_test = Y[test_idx]

    # Class-balance weights are derived from the training labels only.
    pos_weights, neg_weights = get_pos_weights(y_train)

    def loss(y_true, y_pred):
        """Weighted cross entropy using this seed's pos_weights; MASK = -1 matches the fill value used for missing labels in Y."""
        return molmap.model.loss.weighted_cross_entropy(y_true, y_pred, pos_weights, MASK = -1)

    clf = MultiLabelEstimator(n_outputs,
                              fmap_shape1, fmap_shape2,
                              batch_size = batch_size,
                              dense_layers = dense_layers,
                              gpuid = gpuid, loss = loss, lr = lr,
                              monitor = 'val_auc',
                             )
    clf.fit(X_train, y_train, X_valid, y_valid)

    # Evaluate on every split, then average with nanmean so NaN entries in
    # the returned scores are skipped.
    train_aucs = clf._performance.evaluate(X_train, y_train)
    valid_aucs = clf._performance.evaluate(X_valid, y_valid)
    test_aucs = clf._performance.evaluate(X_test, y_test)

    train_auc = np.nanmean(train_aucs)
    valid_auc = np.nanmean(valid_aucs)
    test_auc = np.nanmean(test_aucs)

    final_res = {'seed': seed,
                 "task_name": task_name,
                 'train_auc': train_auc,
                 'valid_auc': valid_auc,
                 'test_auc': test_auc,}

    print(final_res)

    # Append immediately so finished seeds survive an interrupted run.
    # train_auc is kept in `res` only; the CSV has no column for it.
    with open(result_file, 'a') as f:
        f.write('%s, %s, %s, %s\n' % (task_name, seed, valid_auc, test_auc))

    res.append(final_res)