## Deepchem DataLoader

In [1]:
from visar.deepchem_utils import prepare_dataset

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
para_dict_visar = {
    'model_name': 'baseline_reg',
    'task_list': ['T107'],
    'eval_type': 'regression',
    # input data related params:
    'dataset_file': '../data/MT_data_clean_June28.csv',
    'feature_type': 'Circular_2048',
    'id_field': 'molregno',
    'smiles_field': 'salt_removed_smi',
    'model_flag': 'MT',
    'add_features': None,
    'frac_train': 0.8,
    'rand_seed': 0,
    # model architecture related parameters:
    'baseline_type': 'RidgeCV'
}

In [None]:
train_loader, test_loader, train_df, test_df = prepare_dataset(para_dict_visar)

In [None]:
train_loader.X.shape, train_loader.y.shape, train_df.shape

## baseline class

In [None]:
from visar.visar_utils import update_bicluster
from visar.VISAR_model import visar_model

In [None]:
baseline_model = visar_model(para_dict_visar)
baseline_model.model_init()
baseline_model.fit(train_loader)

In [None]:
print(baseline_model.model.coef_[0:10])

In [None]:
baseline_model.generate_viz_results(train_loader, train_df, '/ridgeCV_')

## deepchem class

In [2]:
para_dict_DC_robustMT = {
    'model_name': 'DC_RobustMT_reg',
    'task_list': ['T107', 'T108'],
    'eval_type': 'regression',
    # input data related params:
    'dataset_file': '../data/MT_data_clean_June28.csv',
    'feature_type': 'Circular_2048',
    'id_field': 'molregno',
    'smiles_field': 'salt_removed_smi',
    'model_flag': 'MT',
    'add_features': None,
    'frac_train': 0.8,
    'rand_seed': 0,
    # model architecture related parameters:
    'layer_sizes': [128, 64],
    'bypass_layer_sizes': [64],
    'dropouts': 0.5,
    'bypass_dropouts': 0.5,
    # model training related parameters:
    'learning_rate': 0.001,
    'GPU': False,
    'epoch': 40, # training epoch of each round (saving model at the end of each round)
    'epoch_num': 20, # how many rounds
    # viz file processing related parameters:
    'valid_cutoff': None, 
    'n_layer': 2
}


In [3]:
train_loader, test_loader, train_df, test_df = prepare_dataset(para_dict_DC_robustMT)

  if (await self.run_code(code, result,  async_=asy)):


Extracted dataset shape: (3471, 4)
Loading raw samples now.
shard_size: 8192
About to start loading CSV from tmp.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
TIMING: featurizing shard 0 took 14.182 s
TIMING: dataset construction took 14.399 s
Loading dataset from disk.
Computing train/valid/test indices
TIMING: dataset construction took 0.339 s
Loading dataset from disk.
TIMING: dataset construction took 0.184 s
Loading dataset from disk.


In [42]:
import os
import json

import deepchem as dc
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

from visar.deepchem_utils import (
    prepare_dataset,
    ST_model_layer1,
    ST_model_layer2
    )

from visar.VISAR_model import visar_model
from visar.visar_utils import update_bicluster, FP_dim

class deepchem_robust_regressor(visar_model):
    def __init__(self, para_dict, *args, **kwargs):
        super().__init__(para_dict, *args, **kwargs)

        # set default parameters

        # extract model_related parameters
        if self.para_dict['add_features'] is None:
            self.n_tasks = len(self.para_dict['task_list'])
        else:
            self.n_tasks = len(self.para_dict['task_list']) + len(self.para_dict['add_features'])
        self.n_features = FP_dim[self.para_dict['feature_type']]
        self.layer_sizes = self.para_dict['layer_sizes']
        self.bypass_layer_sizes = self.para_dict['bypass_layer_sizes']
        self.dropout = self.para_dict['dropouts']
        self.bypass_dropouts = self.para_dict['bypass_dropouts']

        # get training params
        self.lr = self.para_dict['learning_rate']
        self.epoch_num = self.para_dict['epoch_num']
        self.epoch = self.para_dict['epoch']


    def model_init(self):
        self.model = dc.models.RobustMultitaskRegressor(n_tasks = self.n_tasks, 
                                n_features = self.n_features, layer_sizes = self.layer_sizes,
                                               bypass_layer_sizes=self.bypass_layer_sizes, 
                                               bypass_dropouts = self.bypass_dropouts,
                                               dropouts = self.dropout, learning_rate = self.lr)
        self.model.model_dir = self.save_path
        return

    def predict(self, data_loader):
        return self.model.predict(data_loader)

    def evaluate(self, data_loader):
        if self.para_dict['eval_type'] == 'regression':
            # only mean r2 score for now
            metric = dc.metrics.Metric(
                dc.metrics.r2_score, np.mean, mode = 'regression')
            scores = self.model.evaluate(data_loader, [metric], [], per_task_metrics=metric)
            return scores
        
        elif self.para_dict['eval_type'] == 'classification':
            pass

    def fit(self, train_loader, test_loader):
        train_evaluation = [train_loader.get_task_names()]
        test_evaluation = [train_loader.get_task_names()]
        for iteration in range(self.epoch_num):
            self.model.fit(train_loader, nb_epoch = self.epoch, max_checkpoints_to_keep = 1, checkpoint_interval=20)
            print('======== Iteration %d ======' % iteration)
            print("Evaluating model")
            train_scores = self.evaluate(train_loader)
            train_evaluation.append(train_scores[1]["mean-r2_score"])
            print("Training R2 score: %f" % train_scores[0]["mean-r2_score"])
            test_scores = self.evaluate(test_loader)
            test_evaluation.append(test_scores[1]["mean-r2_score"])
            print("Test R2 score: %f" % test_scores[0]["mean-r2_score"])
        
            # save evaluation scores
            train_df = pd.DataFrame(np.array(train_evaluation))
            test_df = pd.DataFrame(np.array(test_evaluation))
            train_df.to_csv(self.save_path + '/train_log.csv', index = None)
            test_df.to_csv(self.save_path + '/test_log.csv', index = None)

    # --------------------------------
    def save_param(self, path = None):
        if path==None:
            filepath = os.path.join(self.model_path, 'train_parameters.json')
        else:
            filepath = os.path.join(path, 'train_parameters.json')
        with open(filepath, 'w') as f:
            json.dump(self.para_dict, f, indent=2)

    def load_param(self, path = None):
        if path == None:
            filepath = os.path.join(self.model_path, 'train_parameters.json')
        else:
            filepath = os.path.join(path, 'train_parameters.json')
        if os.path.exists(filepath):
            return json.load(open(filepath, 'r'))
        return None

    def load_model(self, prev_model):
        self.model.restore(checkpoint = prev_model)
        return

    # --------------------------------
    def get_weights_RobustMT(self, layer_variables):
        with self.model._get_tf("Graph").as_default():
            w1 = model.session.run(layer_variables[0])
            b1 = model.session.run(layer_variables[1])
        return [w1, b1]

    def get_transfer_model(self, n_layer = 2):
        # load previous parameters
        tot_layer_variables = self.model.get_variables()
        param1 = self.get_weights_RobustMT(self.model, [tot_layer_variables[0], tot_layer_variables[1]])
        param2 = self.get_weights_RobustMT(self.model, [tot_layer_variables[2], tot_layer_variables[3]])
    
        n_features = param1[0].shape[0]
        layer_size = [param1[0].shape[1], param2[0].shape[1]]
    
        if n_layer == 1:
            transfer_model = ST_model_layer1(self.n_features, layer_size, [param1, param2])
        elif n_layer == 2:
            transfer_model = ST_model_layer2(self.n_features, layer_size, [param1, param2])
        else:
            print('invalid layer size!')
        return transfer_model

    def get_coords(self, transfer_model, train_loader, custom_loader = None, mode = 'default'):
        if mode == 'default':
            transfer_values = transfer_model.predict(train_loader.X)
            N_training = train_loader.X.shape[0]
            if not custom_loader is None:
                transfer_values2 = transfer_model.predict(custom_loader.X)
                N_custom = len(custom_loader.X)
                transfer_values = np.concatenate((transfer_values, transfer_values2), axis = 0)

            pca = PCA(n_components = 20)
            value_reduced_20d = pca.fit_transform(transfer_values)
            tsne = TSNE(n_components = 2)
            value_reduced = tsne.fit_transform(value_reduced_20d)

            if not custom_loader is None:
                return value_reduced[0:N_training,:], value_reduced[N_training:(N_training+N_custom),:]
            else:
                return value_reduced, None

    # gradient calculation
    def calculate_gradients(self, X_train, task_tensor_name, prev_model):
        '''
        Calculate the gradients for each chemical
        input: X_train --- fingerprint matrix of the chemicals of interest
               prev_model -- trained neural network model
        output: the gradient matrix
        '''
        feed_dict = {}

        with tf.Graph().as_default():
            with tf.Session() as sess:
                K.set_session(sess)

                new_saver = tf.train.import_meta_graph(prev_model + '.meta')
                new_saver.restore(sess, prev_model)
                graph = tf.get_default_graph()

                feed_dict['Feature_8/PlaceholderWithDefault:0'] = X_train
                #feed_dict['Dense_7/Dense_7/Relu:0'] = X_train[0:10,0:512]
                feed_dict['Placeholder:0'] = 1.0

                op_tensor = graph.get_tensor_by_name(task_tensor_name)
                X = graph.get_tensor_by_name('Feature_8/PlaceholderWithDefault:0')
                #X = graph.get_tensor_by_name('Dense_7/Dense_7/Relu:0')

                reconstruct = tf.gradients(op_tensor, X)[0]
                out = sess.run(reconstruct, feed_dict = feed_dict)[0]

        K.clear_session()
        return out

    def generate_task_df(self, dataset, prev_model, valid_mask):
        n_bypass = len(self.bypass_layer_sizes)
        TASK_LAYERS = ['Dense_%d/Dense_%d/Relu:0' % (10 + n_bypass * 2 * idx, 10 + n_bypass * 2 * idx)
                        for idx in range(n_tasks)]
        TASK_LAYERS = list(np.array(TASK_LAYERS)[valid_mask])
        SHARE_LAYER = 'Dense_7/Dense_7/Relu:0'
        grad_mat = np.zeros((len(TASK_LAYERS)+1, self.n_features))

        for i in range(len(TASK_LAYERS)):
            grad_mat[i,:] = calculate_gradients(dataset.X, TASK_LAYERS[i], prev_model)
        grad_mat[len(TASK_LAYERS),:] = calculate_gradients(dataset.X, SHARE_LAYER, prev_model)
        self.task_df = pd.DataFrame(grad_mat.T)
        self.task_df.columns = list(self.tasks[valid_mask]) + ['SHARE']

        return
        
    def generate_viz_results(self, train_loader, train_df, output_prefix,
                             custom_loader = None, custom_df = None, prev_model = None):
        self.load_model(self, prev_model)

        # get the actual task list from log files
        test_log_df = pd.read_csv(self.save_path + 'test_log.csv')
        self.tasks = test_log_df.columns.values

        if self.para_dict['valid_cutoff'] is not None:
            final_merit = test_log_df.iloc[-1,].values
            valid_mask = final_merit > valid_cutoff
        else:
            valid_mask = np.array([True] * self.n_tasks)

        print('------------- Prepare information for chemicals ------------------')
        # calculate transfer values and coordinates
        model_transfer = self.get_transfer_model(n_layer = self.para_dict['n_layer'])
        coord_values1, coord_values2 = self.get_coords(model_transfer, train_loader, custom_loader)

        # prediction for the training set
        self.compound_df1 = generate_compound_df(train_loader, train_df, coord_values1, valid_mask)
        if not custom_loader is None:
            self.compound_df2 = generate_compound_df(custom_loader, custom_df, coord_values2, valid_mask)

        print('------------- Prepare information for minibatches ------------------')
        # clustering
        self.generate_batch_df(train_loader, custom_loader, coord_values1, coord_values2)

        print('------------- Prepare information for tasks ------------------')
        # derivative/gradient/sensitivity calculation
        self.generate_task_df(train_loader, prev_model, valid_mask)

        print('------- Generate color labels with default K of 5 --------')
        # color mapping
        batch_df, task_df, compound_df = update_bicluster(self.batch_df, self.task_df, self.compound_df1, mode = 'RobustMT', K = 5)
        if not custom_loader is None:
            lut2 = dict(zip(batch_df['Label_id'], batch_df['batch_label_color']))
            lut22 = dict(zip(batch_df['Label_id'], batch_df['batch_label']))
            lut222 = dict(zip(compound_df['label'], compound_df['label_color']))
            compound_df2['batch_label_color'] = self.compound_df2['label'].map(lut2)
            compound_df2['batch_label'] = self.compound_df2['label'].map(lut22)
            compound_df2['label_color'] = self.compound_df2['label'].map(lut222)

        print('-------------- Saving datasets ----------------')
        # saving results
        compound_df.to_csv(self.save_path + output_prefix + 'compound_df.csv', index = False)
        batch_df.to_csv(self.save_path + output_prefix + 'batch_df.csv', index = False)
        task_df.to_csv(self.save_path + output_prefix + 'task_df.csv', index = False)

        if not custom_loader is None:
            compound_df2.to_csv(output_prefix + 'compound_custom_df.csv', index = False)
        
        return


In [17]:
print(help(deepchem_robust_regressor))

Help on class deepchem_robust_regressor in module __main__:

class deepchem_robust_regressor(visar.VISAR_model.visar_model)
 |  Method resolution order:
 |      deepchem_robust_regressor
 |      visar.VISAR_model.visar_model
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, para_dict, *args, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  calculate_gradients(self, X_train, task_tensor_name, prev_model)
 |      Calculate the gradients for each chemical
 |      input: X_train --- fingerprint matrix of the chemicals of interest
 |             prev_model -- trained neural network model
 |      output: the gradient matrix
 |  
 |  evaluate(self, data_loader)
 |  
 |  fit(self, dataset)
 |  
 |  generate_task_df(self, dataset, prev_model, valid_mask)
 |  
 |  generate_viz_results(self, train_loader, train_df, output_prefix, custom_loader=None, custom_df=None, prev_model=None)
 |  
 |  get_coords(self, transfer_model, trai

In [43]:
robust_model = deepchem_robust_regressor(para_dict_DC_robustMT)

In [44]:
robust_model.model_init()

In [34]:
print(robust_model.__repr__())

RobustMultitaskRegressor(activation_fns=None, bias_init_consts=None,
                         bypass_bias_init_consts=None, bypass_dropouts=None,
                         bypass_layer_sizes=None,
                         bypass_weight_init_stddevs=None, dropouts=None,
                         layer_sizes=None, n_features=2048, n_tasks=2,
                         weight_decay_penalty=None,
                         weight_decay_penalty_type=None,
                         weight_init_stddevs=None)
None


In [45]:
y_pred = robust_model.predict(test_loader)

In [46]:
robust_model.fit(train_loader, test_loader)

Evaluating model
computed_metrics: [0.9428589651853301, 0.9419415473750377]
Training R2 score: 0.942400
computed_metrics: [0.4240126613218471, 0.5108456035531777]
Test R2 score: 0.467429


KeyboardInterrupt: 

In [49]:
prev_model = robust_model.model.model_dir + '/ckpt-65'

In [50]:
robust_model.load_model(prev_model)

In [52]:
robust_model.model.

{'activation_fns': None,
 'bias_init_consts': None,
 'bypass_bias_init_consts': None,
 'bypass_dropouts': None,
 'bypass_layer_sizes': None,
 'bypass_weight_init_stddevs': None,
 'dropouts': None,
 'layer_sizes': None,
 'n_features': 2048,
 'n_tasks': 2,
 'weight_decay_penalty': None,
 'weight_decay_penalty_type': None,
 'weight_init_stddevs': None}

In [41]:
import tensorflow as tf
tf.test.is_gpu_available()

False