In [1]:
#### !/usr/bin/env python
# coding: utf-8
from molmap.model import RegressionEstimator, MultiClassEstimator, MultiLabelEstimator
from molmap import loadmap
from molmap.show import imshow_wrap
import molmap
from molmap import MolMap

from sklearn.utils import shuffle 
from joblib import load, dump
import numpy as np
import pandas as pd
import os

from chembench import dataset


### optimized hyper-parameters

In [2]:
### optimized hyper-parameters
# UMAP settings, forwarded to mp.fit(method='umap', ...).
min_dist = 0.25
n_neighbors = 15

# Name of the feature-map family.
# NOTE(review): not referenced by the cells visible here - confirm it is still needed.
input_feature_maps = 'descriptor'

# Network / training settings, forwarded to RegressionEstimator.
dense_layers = [128]
batch_size = 8

In [3]:
## random
# Load the ESOL benchmark and unpack the pieces the following cells rely on.
data = dataset.load_ESOL()
task_name, task_type = data.task_name, data.task_type
Y, df = data.y, data.df

# Number of columns in the target matrix; passed to RegressionEstimator
# as its first positional argument.
n_outputs = Y.shape[1]

# GPU id forwarded to RegressionEstimator(gpuid=...).
gpuid = 6

# Each seed names one sub-directory holding a pre-computed
# train / val / test split (see the training loop).
random_seeds = [
    2, 16, 32, 64, 128,
    256, 512, 1024, 2048, 4096,
]

total samples: 1128


In [4]:
# Load the saved descriptor map and (re)fit its 2-D layout with UMAP,
# using the tuned n_neighbors / min_dist from the hyper-parameter cell.
mp = loadmap('../../descriptor.mp')
mp.fit(
    method='umap',
    min_dist=min_dist,
    n_neighbors=n_neighbors,
)

UMAP(metric='precomputed', min_dist=0.25, random_state=1, verbose=2)
Construct fuzzy simplicial set
Thu Sep 17 13:54:36 2020 Finding Nearest Neighbors
Thu Sep 17 13:54:37 2020 Finished Nearest Neighbor Search
Thu Sep 17 13:54:38 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Sep 17 13:54:42 2020 Finished embedding
2020-09-17 13:54:42,941 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-09-17 13:54:45,681 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


In [5]:
# Transform every SMILES string in the dataset into its feature map,
# using 16 parallel jobs; rows of X follow the row order of df.
X = mp.batch_transform(df['smiles'].tolist(), n_jobs=16)

100%|##########| 1128/1128 [01:15<00:00, 15.01it/s]


In [6]:
# Shape of a single sample's feature map: X's shape with the leading
# (per-sample) axis dropped. Passed to RegressionEstimator as its second
# positional argument.
fmap_shape1 = X.shape[1:]

In [7]:
# Root directory of the pre-computed splits, laid out as
#   <file_path>/<task_name>/<seed>/{train,val,test}.csv
# The original absolute path remains the default; set MOLMAP_SPLIT_DIR to run
# the notebook on a machine where the splits live somewhere else.
file_path = os.environ.get(
    "MOLMAP_SPLIT_DIR",
    "/raid/shenwanxiang/08_Robustness/dataset_induces/split",
)  # split

result_file = 'OPT_%s.csv' % task_name

# Start a fresh results file on every run of this cell.
# NOTE: the header and the rows appended below put a space after each comma,
# so readers should strip it (e.g. pd.read_csv(..., skipinitialspace=True)).
with open(result_file, 'w') as f:
    f.write('task_name, seed, valid_rmse, test_rmse\n')


def _row_positions(split_smiles):
    """Return the positional row numbers of `df` whose SMILES are in `split_smiles`.

    Positions (not index labels) are returned because they are used to index
    the arrays X and Y, whose rows follow the row order of `df`. Using
    `df[...].index` instead is only correct when `df` has a default
    0..n-1 index.
    """
    mask = df.smiles.isin(split_smiles).values
    return np.flatnonzero(mask).tolist()


# One training / evaluation run per pre-computed split.
res = []
for seed in random_seeds:

    split_dir = os.path.join(file_path, task_name, "%s" % seed)
    train_path = os.path.join(split_dir, "train.csv")
    valid_path = os.path.join(split_dir, "val.csv")
    test_path = os.path.join(split_dir, "test.csv")

    train_df = pd.read_csv(train_path)
    valid_df = pd.read_csv(valid_path)
    test_df = pd.read_csv(test_path)

    # Map each split back onto rows of the full dataset by SMILES string.
    train_idx = _row_positions(train_df.smiles)
    valid_idx = _row_positions(valid_df.smiles)
    test_idx = _row_positions(test_df.smiles)

    print(len(train_idx), len(valid_idx), len(test_idx))

    X_train = X[train_idx]
    y_train = Y[train_idx]

    X_valid = X[valid_idx]
    y_valid = Y[valid_idx]

    X_test = X[test_idx]
    y_test = Y[test_idx]

    # A new estimator is built for every split so no state carries over.
    clf = RegressionEstimator(n_outputs,
                              fmap_shape1,
                              batch_size = batch_size,
                              dense_layers = dense_layers,
                              gpuid = gpuid,
                              )
    clf.fit(X_train, y_train, X_valid, y_valid)

    # NOTE(review): `_performance` is a private attribute of the estimator;
    # its evaluate() is unpacked here as (rmse values, r2 values) - confirm
    # against the installed molmap version.
    train_rmses, train_r2s = clf._performance.evaluate(X_train, y_train)
    valid_rmses, valid_r2s = clf._performance.evaluate(X_valid, y_valid)
    test_rmses, test_r2s = clf._performance.evaluate(X_test, y_test)

    # Average over the returned values, ignoring NaNs.
    train_rmse = np.nanmean(train_rmses)
    valid_rmse = np.nanmean(valid_rmses)
    test_rmse = np.nanmean(test_rmses)

    final_res = {'seed': seed,
                 "task_name": task_name,
                 'train_rmse': train_rmse,
                 'valid_rmse': valid_rmse,
                 'test_rmse': test_rmse}
    print(final_res)

    # Append immediately so finished seeds survive an interrupted run.
    # (train_rmse is only kept in `res`, not in this file.)
    with open(result_file, 'a') as f:
        f.write('%s, %s, %s, %s\n' % (task_name, seed, valid_rmse, test_rmse))

    res.append(final_res)


904 112 112
RegressionEstimator(batch_size=8, dense_layers=[128], gpuid='6')
epoch: 0001, loss: 4.9304 - val_loss: 3.8446; rmse: 1.9477 - rmse_val: 1.9608;  r2: 0.4370 - r2_val: 0.5272                                                                                                    
epoch: 0002, loss: 3.3967 - val_loss: 2.8387; rmse: 1.7045 - rmse_val: 1.6848;  r2: 0.5271 - r2_val: 0.5915                                                                                                    
epoch: 0003, loss: 2.4921 - val_loss: 1.9237; rmse: 1.4219 - rmse_val: 1.3870;  r2: 0.6251 - r2_val: 0.6626                                                                                                    
epoch: 0004, loss: 1.6742 - val_loss: 1.3995; rmse: 1.1654 - rmse_val: 1.1830;  r2: 0.7314 - r2_val: 0.7351                                                                                                    
epoch: 0005, loss: 1.2719 - val_loss: 1.2716; rmse: 1.0599 - rmse_val: 1.1276;  r2: 0.7699 

In [8]:
# Dump the full per-seed records collected in `res` (these also carry
# train_rmse) to a companion CSV next to the main results file.
res_df = pd.DataFrame(res)
res_df.to_csv(result_file + '.bak.csv')