In [1]:
# import the required modules

import numpy as np
np.random.seed(456)
import matplotlib.pyplot as plt
import deepchem as dc
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Baseline: fit a scikit-learn random forest on the first Tox21 task and
# report the weighted accuracy on the train, valid and test splits.

# Only the middle return value (the three dataset splits) is used; the first
# and last values returned by load_tox21 are discarded.
_, (train, valid, test), _ = dc.molnet.load_tox21()

# Features are taken as-is. Labels and weights are reduced to column 0 so
# that a single task is modelled and the extra tasks are dropped.
train_X, valid_X, test_X = train.X, valid.X, test.X
train_y, valid_y, test_y = (split.y[:, 0] for split in (train, valid, test))
train_w, valid_w, test_w = (split.w[:, 0] for split in (train, valid, test))

# 50-tree forest; class_weight="balanced" makes scikit-learn weight classes
# inversely to their frequency in the training labels.
# NOTE(review): the per-sample weights are only used for scoring below;
# fit() is called without them -- confirm this is intended.
sklearn_model = RandomForestClassifier(n_estimators=50, class_weight="balanced")
print("About to fit model on train set.")
sklearn_model.fit(train_X, train_y)

# Predict on every split before any scoring takes place.
train_y_pred, valid_y_pred, test_y_pred = (
    sklearn_model.predict(features) for features in (train_X, valid_X, test_X)
)

# Accuracy per split, with each sample's contribution scaled by its weight.
weighted_score = accuracy_score(
    train_y, train_y_pred, sample_weight=train_w)
print("Weighted train Classification Accuracy: %f" % weighted_score)

weighted_score = accuracy_score(
    valid_y, valid_y_pred, sample_weight=valid_w)
print("Weighted valid Classification Accuracy: %f" % weighted_score)

weighted_score = accuracy_score(
    test_y, test_y_pred, sample_weight=test_w)
print("Weighted test Classification Accuracy: %f" % weighted_score)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /var/folders/dw/zv841rqs40l1wq1t4932b4sh0000gn/T/tox21.csv.gz
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
TIMING: featurizing shard 0 took 18.120 s
TIMING: dataset construction took 19.136 s
Loading dataset from disk.
TIMING: dataset construction took 2.565 s
Loading dataset from disk.
TIMING: dataset construction took 1.527 s
Loading dataset from disk.
TIMING: dataset construction took 0.759 s
Loading dataset from disk.
TIMING: dataset construction took 0.752 s
Loading dataset from disk.
About to fit model on train set.
Weighted train Classification Accuracy: 0.996769
Weighted valid Classification Accuracy: 0.680170
Weighted test Classification Accuracy: 0.650164


In [3]:
# import function to perform a grid search

from fcnet_func import eval_tox21_hyperparams

In [4]:
"""
Small grid search over the hyperparameters of the fully connected network.

Every combination of the lists below is passed to eval_tox21_hyperparams:

  n_hidden     <- hidden_sizes  (no. of neurons per hidden layer)
  n_layers     <- num_layers    (no. of hidden layers)
  n_epochs     <- epochs        (no. of epochs)
  dropout_prob <- dropouts      (dropout probability)

Learning rate and batch size are also model hyperparameters, but they are not
passed here, so eval_tox21_hyperparams falls back to whatever it uses when
they are omitted.
"""

# Search grid: the full cross product is evaluated n_reps times.
n_reps = 1
hidden_sizes = [5]
epochs = [1]
dropouts = [.5, 1.0]
num_layers = [1, 2]

# (hidden_size, n_epochs, dropout, n_layers) -> list of scores, one entry
# appended per repetition.
scores = {}

for rep in range(n_reps):
  for n_epochs in epochs:
    for hidden_size in hidden_sizes:
      for dropout in dropouts:
        for n_layers in num_layers:
          score = eval_tox21_hyperparams(
              n_hidden=hidden_size,
              n_layers=n_layers,
              n_epochs=n_epochs,
              dropout_prob=dropout)
          # setdefault creates the list the first time a combination is seen.
          scores.setdefault(
              (hidden_size, n_epochs, dropout, n_layers), []).append(score)
print("All Scores")
print(scores)



---------------------------------------------
Model hyperparameters
n_hidden = 5
n_layers = 1
learning_rate = 0.001000
n_epochs = 1
batch_size = 100
weight_positives = True
dropout_prob = 0.500000
---------------------------------------------
Loading dataset from disk.
Loading dataset from disk.
Loading dataset from disk.
epoch 0, step 0, loss: 446.149902
epoch 0, step 1, loss: 234.444077
epoch 0, step 2, loss: 216.360336
epoch 0, step 3, loss: 370.723145
epoch 0, step 4, loss: 493.688934
epoch 0, step 5, loss: 241.417664
epoch 0, step 6, loss: 251.523376
epoch 0, step 7, loss: 261.556946
epoch 0, step 8, loss: 852.065735
epoch 0, step 9, loss: 946.692871
epoch 0, step 10, loss: 185.214966
epoch 0, step 11, loss: 241.288727
epoch 0, step 12, loss: 1265.515747
epoch 0, step 13, loss: 835.334656
epoch 0, step 14, loss: 974.542908
epoch 0, step 15, loss: 322.370636
epoch 0, step 16, loss: 315.324768
epoch 0, step 17, loss: 321.011414
epoch 0, step 18, loss: 644.870850
epoch 0, step 19, lo

In [None]:
# Evaluate a single configuration by hand. The arguments are the loop
# variables left bound by the grid-search loop, so this cell only works after
# that loop has run and it evaluates whichever combination the loop ended on.
eval_tox21_hyperparams(
    n_hidden=hidden_size,
    n_layers=n_layers,
    n_epochs=n_epochs,
    dropout_prob=dropout)

In [29]:
# Inspect the raw grid-search results: one (hyperparameter tuple, list of
# scores) pair per evaluated configuration.
scores.items()

dict_items([((5, 1, 0.5, 1), [0.5490761141892654]), ((5, 1, 0.5, 2), [0.6320999671539234]), ((5, 1, 1.0, 1), [0.47306501844894594]), ((5, 1, 1.0, 2), [0.4350043275305265])])

In [5]:
# Collapse each configuration's list of per-repetition scores to its mean,
# keyed by the same hyperparameter tuple as in `scores`.
avg_scores = {
    params: np.mean(np.array(param_scores))
    for params, param_scores in scores.items()
}
print("Scores Averaged over %d repetitions" % n_reps)
print(avg_scores)

Scores Averaged over 1 repetitions
{(5, 1, 0.5, 1): 0.5490761141892654, (5, 1, 0.5, 2): 0.6320999671539234, (5, 1, 1.0, 1): 0.47306501844894594, (5, 1, 1.0, 2): 0.4350043275305265}
