## Deepchem DataLoader

In [None]:
from visar.deepchem_utils import prepare_dataset

In [None]:
# Settings for the single-task RidgeCV baseline. The same dict is passed to
# prepare_dataset and to visar_model further down in this section.
para_dict_visar = {
    # --- model identity / task setup ---
    "model_name": "baseline_reg",
    "task_list": ["T107"],
    "eval_type": "regression",

    # --- input data ---
    "dataset_file": "./data/MT_data_clean_June28.csv",
    "feature_type": "Circular_2048",
    "id_field": "molregno",
    "smiles_field": "salt_removed_smi",
    "model_flag": "MT",
    "add_features": None,
    "frac_train": 0.8,
    "rand_seed": 0,

    # --- model architecture ---
    "baseline_type": "RidgeCV",
}

In [None]:
# prepare_dataset returns four objects for the baseline settings; they are bound
# here as the train/test loaders and the train/test dataframes.
train_loader, test_loader, train_df, test_df = prepare_dataset(para_dict_visar)

In [None]:
# Sanity check: show the shapes of train_loader.X, train_loader.y and train_df.
train_loader.X.shape, train_loader.y.shape, train_df.shape

## Baseline class (`visar_model`)

In [None]:
from visar.visar_utils import update_bicluster
from visar.VISAR_model import visar_model

In [None]:
# Create the baseline model from the settings dict, initialise it, then fit it
# on the training loader.
baseline_model = visar_model(para_dict_visar)
baseline_model.model_init()
baseline_model.fit(train_loader)

In [None]:
# Peek at the first ten entries of the fitted estimator's coef_ attribute.
print(baseline_model.model.coef_[:10])

In [None]:
# Generate the visualisation results for the training loader and dataframe.
# NOTE(review): '/ridgeCV_' is passed positionally; presumably it is the output
# prefix used by the keyword calls later in this notebook. Confirm, including
# whether the leading slash is intended.
baseline_model.generate_viz_results(train_loader, train_df, '/ridgeCV_')

In [None]:
# Call generate_task_df on the baseline model (it takes no arguments here).
baseline_model.generate_task_df()

In [None]:
# Take the first SMILES string of the training dataframe as a test molecule.
SMILES_test = train_df[para_dict_visar['smiles_field']].iloc[0]
# Run the per-instance analysis; only the first and last of the four returned
# values are kept (the two middle ones are discarded into `_`).
mol, _, _, atoms_contrib = baseline_model.generate_instance_analysis(smiles_string = SMILES_test)

In [None]:
# Display the first value returned by generate_instance_analysis.
mol

## DeepChem robust multitask regressor (`deepchem_robust_regressor`)

In [None]:
from visar.deepchem_regressor import deepchem_robust_regressor

In [None]:
import os
# Restrict CUDA to device index 1 for this process.
# NOTE(review): "GPU" is False in the settings below; confirm whether this
# device restriction is still needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Settings for the two-task robust multitask regressor. The same dict is passed
# to prepare_dataset and to deepchem_robust_regressor in the following cells.
para_dict_DC_robustMT = {
    # --- model identity / task setup ---
    "model_name": "DC_RobustMT_reg",
    "task_list": ["T107", "T108"],
    "eval_type": "regression",

    # --- input data ---
    "dataset_file": "./data/MT_data_clean_June28.csv",
    "feature_type": "Circular_2048",
    "id_field": "molregno",
    "smiles_field": "salt_removed_smi",
    "model_flag": "MT",
    "add_features": None,
    "frac_train": 0.8,
    "rand_seed": 0,

    # --- model architecture ---
    "layer_sizes": [128, 64],
    "bypass_layer_sizes": [64],
    "dropouts": 0.5,
    "bypass_dropouts": 0.5,

    # --- training ---
    "learning_rate": 0.001,
    "GPU": False,
    "epoch": 40,     # epochs per training round (the model is saved at the end of each round)
    "epoch_num": 2,  # number of training rounds

    # --- visualisation file processing ---
    "model_architecture": ["RobustMT"],
    "valid_cutoff": None,
    "n_layer": 2,
}


In [None]:
# Rebuild the loaders and dataframes with the robust-multitask settings.
# This rebinds the names used in the baseline section.
train_loader, test_loader, train_df, test_df = prepare_dataset(para_dict_DC_robustMT)

In [None]:
# Create the robust regressor wrapper from the settings dict.
robust_model = deepchem_robust_regressor(para_dict_DC_robustMT)

In [None]:
# Initialise the underlying model; this is done before fit() is called below.
robust_model.model_init()

In [None]:
# Print whatever the wrapper's __repr__ method returns, to inspect the model.
print(robust_model.__repr__())

In [None]:
# Fit on the training loader, passing the test loader as the second argument.
# NOTE(review): restore_flag = False presumably means training starts from
# scratch rather than from a saved checkpoint. Confirm.
robust_model.fit(train_loader, test_loader, restore_flag = False)

In [None]:
# Path of the checkpoint handed to generate_viz_results.
# NOTE(review): the suffix 1120 is hard-coded; confirm it matches the
# checkpoint actually written under save_path by the fit call.
prev_model = f"{robust_model.save_path}/model-1120"
robust_model.generate_viz_results(
    train_loader,
    train_df,
    output_prefix="Robust_test",
    prev_model=prev_model,
)

In [None]:
# Load the saved checkpoint into the model, then call generate_task_df with
# the training loader and the same checkpoint path.
# NOTE(review): the suffix 1120 is hard-coded; confirm the checkpoint exists.
prev_model = f"{robust_model.save_path}/model-1120"
robust_model.load_model(prev_model)
robust_model.generate_task_df(train_loader, prev_model)

In [None]:
# Take the first SMILES string of the training dataframe as a test molecule.
SMILES_test = train_df[para_dict_DC_robustMT['smiles_field']].iloc[0]
# Run the per-instance analysis; only the first and last of the four returned
# values are kept (the two middle ones are discarded into `_`).
mol, _, _, atoms_contrib = robust_model.generate_instance_analysis(smiles_string = SMILES_test)

In [None]:
# Display the entry stored under the 'T108' key of atoms_contrib.
atoms_contrib['T108']

## deepchem multitask regressor

In [1]:
import os
from visar.deepchem_utils import prepare_dataset
import deepchem as dc
from visar.deepchem_regressor import deepchem_regressor
import tensorflow as tf
# Restrict CUDA to device index 1 for this process.
# NOTE(review): this runs after `import tensorflow` in this cell, and "GPU" is
# False in the settings below. Confirm the setting takes effect and is needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Settings for the two-task DeepChem multitask regressor. The same dict is
# passed to prepare_dataset and to deepchem_regressor in the following cells.
para_dict_DC_MT = {
    # --- model identity / task setup ---
    "model_name": "DC_MT_reg",
    "task_list": ["T107", "T108"],
    "eval_type": "regression",

    # --- input data ---
    "dataset_file": "./data/MT_data_clean_June28.csv",
    "feature_type": "Circular_2048",
    "id_field": "molregno",
    "smiles_field": "salt_removed_smi",
    "model_flag": "MT",
    "add_features": None,
    "frac_train": 0.9,
    "rand_seed": 0,

    # --- model architecture ---
    "layer_sizes": [128, 64],
    "dropouts": 0.5,

    # --- training ---
    "learning_rate": 0.001,
    "GPU": False,
    "epoch": 40,     # epochs per training round (the model is saved at the end of each round)
    "epoch_num": 2,  # number of training rounds

    # --- visualisation file processing ---
    # NOTE(review): a plain string here, whereas the robust-model settings use
    # a one-element list for this key. Confirm which form is expected.
    "model_architecture": "ST",
    "valid_cutoff": None,
    "n_layer": 2,
}

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Build the loaders and dataframes for the multitask settings (frac_train is
# 0.9 in this section's settings dict).
train_loader, test_loader, train_df, test_df = prepare_dataset(para_dict_DC_MT)

  if (await self.run_code(code, result,  async_=asy)):


Extracted dataset shape: (3471, 4)
Loading raw samples now.
shard_size: 8192
About to start loading CSV from tmp.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
TIMING: featurizing shard 0 took 19.166 s
TIMING: dataset construction took 19.380 s
Loading dataset from disk.
Computing train/valid/test indices
TIMING: dataset construction took 0.527 s
Loading dataset from disk.
TIMING: dataset construction took 0.214 s
Loading dataset from disk.


In [3]:
# Create the multitask regressor wrapper, initialise the underlying model, and
# show the result of the wrapper's __repr__ method as the cell output.
mt_model = deepchem_regressor(para_dict_DC_MT)
mt_model.model_init()
mt_model.__repr__()

MultitaskRegressor(activation_fns=None, bias_init_consts=None, dropouts=None,
                   layer_sizes=None, n_features=2048, n_tasks=2,
                   uncertainty=None, weight_decay_penalty=None,
                   weight_decay_penalty_type=None, weight_init_stddevs=None)


In [None]:
# Fit on the training loader, passing the test loader as the second argument.
# NOTE(review): restore_flag = True here, while the robust-model section uses
# restore_flag = False. Presumably this resumes from an existing checkpoint;
# confirm it also works on a fresh run with no saved model.
mt_model.fit(train_loader, test_loader, restore_flag = True)

In [None]:
# Path of the checkpoint handed to generate_viz_results.
# NOTE(review): the suffix 1120 is hard-coded; confirm it matches the
# checkpoint actually written under save_path by the fit call.
prev_model = f"{mt_model.save_path}/model-1120"
mt_model.generate_viz_results(
    train_loader,
    train_df,
    output_prefix="MT_test",
    prev_model=prev_model,
)

In [4]:
# Take the first SMILES string of the training dataframe as a test molecule.
SMILES_test = train_df[para_dict_DC_MT["smiles_field"]].iloc[0]

# Load the saved checkpoint and call generate_task_df with the same path.
# NOTE(review): the suffix 1120 is hard-coded; confirm the checkpoint exists.
prev_model = f"{mt_model.save_path}/model-1120"
mt_model.load_model(prev_model)
mt_model.generate_task_df(train_loader, prev_model)

# Run the per-instance analysis; only the first and last of the four returned
# values are kept (the two middle ones are discarded into `_`).
mol, _, _, atoms_contrib = mt_model.generate_instance_analysis(
    smiles_string=SMILES_test
)

INFO:tensorflow:Restoring parameters from /home/dqy14/workplace/VISAR_workplace/logs/DC_MT_reg/model/model-1120
INFO:tensorflow:Restoring parameters from /home/dqy14/workplace/VISAR_workplace/logs/DC_MT_reg/model/model-1120


In [5]:
# Display the full atoms_contrib mapping returned by generate_instance_analysis.
atoms_contrib

{'SHARE': array([[-0.57198474],
        [-0.37418113],
        [-1.080391  ],
        [-1.2359043 ],
        [-0.18426663],
        [ 0.68189861],
        [ 0.46629221],
        [-0.20759526],
        [ 0.74353109],
        [ 1.0035504 ],
        [ 1.05621474],
        [ 0.48930795],
        [ 0.81215509],
        [ 0.96399868],
        [ 0.72784745],
        [ 1.24009725]])}

In [None]:
# Diagnostic cell: ask TensorFlow whether a GPU is available.
import tensorflow as tf
tf.test.is_gpu_available()

In [None]:
# Show the documentation of deepchem_robust_regressor.
# help() prints the text itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after the help text.
help(deepchem_robust_regressor)