In [84]:
! pip install deepchem



In [85]:
import deepchem as dc
from deepchem.feat.molecule_featurizers import MolGraphConvFeaturizer

In [86]:
feat = MolGraphConvFeaturizer(use_edges=True)

toxcast_tasks, toxcast_datasets, transformers = dc.molnet.load_toxcast(featurizer=feat)
train_dataset, valid_dataset, test_dataset = toxcast_datasets

print(len(train_dataset), len(valid_dataset), len(test_dataset))

6852 857 857


In [87]:
toxcast_tasks_one, toxcast_datasets_one, transformers_one = dc.molnet.load_toxcast(featurizer="GraphConv")
train_dataset_one, valid_dataset_one, test_dataset_one = toxcast_datasets_one

print(len(train_dataset_one), len(valid_dataset_one), len(test_dataset_one))

6861 858 858


In [88]:
! pip install dgl dgllife



In [89]:
from deepchem.models import GATModel, GCNModel, GraphConvModel

gcn_model_one = GraphConvModel(n_tasks=len(toxcast_tasks_one), mode="classification", dropout=0.1, batch_size=32, learning_rate=0.001)
gat_model = GATModel(mode="classification", dropout=0.1, n_tasks=len(toxcast_tasks), learning_rate=0.001, batch_size=32)
gcn_model = GCNModel(n_tasks=len(toxcast_tasks), mode="classification", dropout=0.1, batch_size=32, learning_rate=0.001)

In [90]:
gcn_model.fit(train_dataset, nb_epoch = 50)

0.23065557479858398

In [91]:
import numpy as np

metric = dc.metrics.Metric(dc.metrics.balanced_accuracy_score, np.mean, mode="classification")
train_scores = gcn_model.evaluate(train_dataset, [metric], transformers)
test_scores = gcn_model.evaluate(test_dataset, [metric], transformers)

print(f"Training Balanced Accuracy: {train_scores}:.3f")
print(f"Testing Balanced Accuracy: {test_scores}:.3f")



Training Balanced Accuracy: {'mean-balanced_accuracy_score': 0.814000813041521}:.3f
Testing Balanced Accuracy: {'mean-balanced_accuracy_score': 0.6026351964239486}:.3f


In [92]:
gat_model.fit(train_dataset, nb_epoch=50)

0.2713625144958496

In [93]:
train_scores = gat_model.evaluate(train_dataset, [metric], transformers)
test_scores = gat_model.evaluate(test_dataset, [metric], transformers)

print(f"Training Balanced Accuracy: {train_scores}:.3f")
print(f"Testing Balanced Accuracy: {test_scores}:.3f")



Training Balanced Accuracy: {'mean-balanced_accuracy_score': 0.7741483311084422}:.3f
Testing Balanced Accuracy: {'mean-balanced_accuracy_score': 0.6048645385492057}:.3f




In [94]:
gcn_model_one.fit(train_dataset_one, nb_epoch=50)

0.21082258224487305

In [95]:
train_scores = gcn_model_one.evaluate(train_dataset_one, [metric], transformers_one)
test_scores = gcn_model_one.evaluate(test_dataset_one, [metric], transformers_one)

print(f"Training Balanced Accuracy: {train_scores}")
print(f"Testing Balanced Accuracy: {test_scores}")



Training Balanced Accuracy: {'mean-balanced_accuracy_score': 0.8416308669714819}
Testing Balanced Accuracy: {'mean-balanced_accuracy_score': 0.5863796951999999}




Observations
-----------

## Baseline

We'll use the GCN Model, with MolGraphConvFeaturizer featurization as the baseline estimator, 
hence no model should do worse than it.

- Training Balanced Accuracy: 84.320%

- Testing Balanced Accuracy: 60%

## Preliminary model

Our preliminary model is a GraphConvModel with GraphConvMol featurization

- Training Balanced Accuracy: 81.699%

- Testing Balanced Accuracy: 60.155%

In [96]:
! pip install --upgrade pip



In [97]:
! pip install pyGPGO



In [98]:
## Hyperparameter optimization

# def model_bulder(**model_params):
#     model = GraphConvModel(**model_params, n_tasks=1, mode="classification")
#     return model

import tensorflow as tf
from deepchem.hyper import GaussianProcessHyperparamOpt, GridHyperparamOpt

params = {
        'graph_conv_layers': [[32, 32], [64, 64], [128, 128]],
        'dense_layer_size': [128, 256, 512]
}

optimizer = GridHyperparamOpt(lambda **p: GraphConvModel(n_tasks=len(toxcast_tasks_one), mode="classification", dropout=0.1, batch_size=32, learning_rate=0.001, **p))

best_model, best_hyperparams, all_results =   optimizer.hyperparam_search(params, train_dataset_one, test_dataset_one, metric, transformers_one)

best_hyperparams



{'graph_conv_layers': [128, 128], 'dense_layer_size': 256}

In [99]:
all_results

{'_dense_layer_size_128_graph_conv_layers[32, 32]': 0.5769704687833895,
 '_dense_layer_size_256_graph_conv_layers[32, 32]': 0.5679807425512707,
 '_dense_layer_size_512_graph_conv_layers[32, 32]': 0.5746413935928033,
 '_dense_layer_size_128_graph_conv_layers[64, 64]': 0.5764633685793026,
 '_dense_layer_size_256_graph_conv_layers[64, 64]': 0.5682762394459503,
 '_dense_layer_size_512_graph_conv_layers[64, 64]': 0.5936834390086138,
 '_dense_layer_size_128_graph_conv_layers[128, 128]': 0.591753125941375,
 '_dense_layer_size_256_graph_conv_layers[128, 128]': 0.594404028339408,
 '_dense_layer_size_512_graph_conv_layers[128, 128]': 0.5725283606498852}

In [100]:
final_model = GraphConvModel(n_tasks=len(toxcast_tasks_one), mode="classification", dropout=0.1, batch_size=32, learning_rate=0.001, dense_layer_size=128, graph_conv_layers= [128, 128])
final_model.fit(train_dataset_one)

train_scores = final_model.evaluate(train_dataset_one, [metric], transformers_one)
test_scores = final_model.evaluate(test_dataset_one, [metric], transformers_one)

print(f"Training Balanced Accuracy: {train_scores}")
print(f"Testing Balanced Accuracy: {test_scores}")



Training Balanced Accuracy: {'mean-balanced_accuracy_score': 0.7563852428202532}
Testing Balanced Accuracy: {'mean-balanced_accuracy_score': 0.5834136532368377}


