## DeepChem Intro

In [20]:
!pip install --pre deepchem
##!pip install --pre deepchem[torch]



In [21]:
import torch
import deepchem as dc
import numpy as np

In [22]:

# Show the installed DeepChem version so the run can be reproduced later.
dc.__version__

'2.6.1'

In [23]:
# Draw the toy data from a seeded generator so the arrays are identical on
# every Restart & Run All (the unseeded global RNG gave new values each run).
rng = np.random.default_rng(0)
x = rng.random((4, 5))  # feature matrix: 4 rows, 5 columns, values in [0, 1)
y = rng.random((4, 1))  # label column: one value per row of x

In [24]:
# Display the 4x5 feature array created above.
x

array([[0.08622397, 0.38120865, 0.05747635, 0.60519003, 0.95423576],
       [0.1995458 , 0.57120083, 0.38771397, 0.00647331, 0.30784863],
       [0.78439844, 0.90280702, 0.69727107, 0.33270014, 0.8657816 ],
       [0.58786339, 0.68613304, 0.23674202, 0.64928248, 0.09668482]])

In [25]:
# Display the 4x1 label array created above.
y

array([[0.50134331],
       [0.63309814],
       [0.16663672],
       [0.61268194]])

## Wrap the arrays in a NumpyDataset

In [26]:
# Bundle the in-memory feature and label arrays into one DeepChem dataset.
dataset = dc.data.NumpyDataset(X=x, y=y)

In [27]:
# The dataset exposes the wrapped features as `.X`; this prints the same
# values as the `x` array.
print(dataset.X)

[[0.08622397 0.38120865 0.05747635 0.60519003 0.95423576]
 [0.1995458  0.57120083 0.38771397 0.00647331 0.30784863]
 [0.78439844 0.90280702 0.69727107 0.33270014 0.8657816 ]
 [0.58786339 0.68613304 0.23674202 0.64928248 0.09668482]]


In [28]:
# The dataset exposes the wrapped labels as `.y`; this prints the same
# values as the `y` array.
print(dataset.y)

[[0.50134331]
 [0.63309814]
 [0.16663672]
 [0.61268194]]


## Tox21 toxicity dataset from MoleculeNet (`dc.molnet`)

In [29]:
# Load Tox21 through MolNet. The call returns three things:
#   tox21_tasks    - list of the 12 task (assay) names
#   tox21_datasets - tuple of three DiskDataset objects (unpacked later as
#                    train / valid / test)
#   transformers   - list of transformers attached to the data (a
#                    BalancingTransformer in the recorded run)
# NOTE(review): presumably the data is downloaded and cached on first use --
# confirm against the DeepChem docs.
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()

## Model to Predict Toxicity of Molecules

In [30]:
# Task names: one entry per label column of `y`.
tox21_tasks

['NR-AR',
 'NR-AR-LBD',
 'NR-AhR',
 'NR-Aromatase',
 'NR-ER',
 'NR-ER-LBD',
 'NR-PPAR-gamma',
 'SR-ARE',
 'SR-ATAD5',
 'SR-HSE',
 'SR-MMP',
 'SR-p53']

In [31]:
# Tuple of three DiskDataset objects; the repr shows the X/y/w shapes of each.
# NOTE(review): the repr also dumps molecule ids (SMILES strings) and is very
# long -- consider displaying only the shapes instead.
tox21_datasets

(<DiskDataset X.shape: (6264, 1024), y.shape: (6264, 12), w.shape: (6264, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>,
 <DiskDataset X.shape: (783, 1024), y.shape: (783, 12), w.shape: (783, 12), ids: ['N#C[C@@H]1CC(F)(F)CN1C(=O)CNC1CC2CCC(C1)N2c1ncccn1'
  'CN(C)C(=O)NC1(c2ccccc2)CCN(CCC[C@@]2(c3ccc(Cl)c(Cl)c3)CCCN(C(=O)c3ccccc3)C2)CC1'
  'CSc1nnc(C(C)(C)C)c(=O)n1N' ...
  'O=C(O[C@H]1CN2CCC1CC2)N1CCc2ccccc2[C@@H]1c1ccccc1'
  'C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@H]3C(=C)C[C@@]21CC'
  'NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3c(c2)CCO3)C1'], task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>,
 <DiskDataset X.shape: (784, 1024), y.shape: (784, 12), w.shape: (784, 12), ids: ['CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O.CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O.c1ccc(CNCCNCc2ccccc2)cc1'
  'CC(C)(c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1)c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1'
  'Cc1cc(C(C)(C)C)c(O)c(C)c1

In [32]:
# Transformers returned with the data; they are passed to model.evaluate()
# later in the notebook.
transformers

[<deepchem.trans.transformers.BalancingTransformer at 0x7f86a0c65130>]

## Split the data

Each sample is a molecule

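Some labels are missing and must be ignored; each dataset carries a weight array `w` with the same shape as `y`, which is what allows missing entries to be skipped during training and evaluation.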

In [33]:
# Unpack the three datasets returned by load_tox21(), in the order
# train, validation, test.
train_dataset, valid_dataset, test_dataset = tox21_datasets

In [34]:
# Training split: feature-matrix shape, then label-matrix shape (one per line).
print(train_dataset.X.shape, train_dataset.y.shape, sep="\n")

(6264, 1024)
(6264, 12)


In [35]:
# Validation split: feature-matrix shape, then label-matrix shape (one per line).
print(valid_dataset.X.shape, valid_dataset.y.shape, sep="\n")

(783, 1024)
(783, 12)


In [36]:
# Test split: feature-matrix shape, then label-matrix shape (one per line).
print(test_dataset.X.shape, test_dataset.y.shape, sep="\n")

(784, 1024)
(784, 12)


## Model object

In [37]:
# Print the module object that holds DeepChem's model classes.
# NOTE(review): the output includes the absolute install path on the local
# machine; clear it before sharing the notebook.
print(dc.models)

<module 'deepchem.models' from '/Users/user/opt/anaconda3/envs/biologyML_py38/lib/python3.8/site-packages/deepchem/models/__init__.py'>


In [38]:

# Build a multitask classifier with a single hidden layer of width 1000.
# The task count and input width are read from the loaded data rather than
# hard-coded, so the cell stays correct if the featurizer or task list changes
# (for the recorded run they evaluate to 12 and 1024).
model = dc.models.MultitaskClassifier(
    n_tasks=len(tox21_tasks),
    n_features=train_dataset.X.shape[1],
    layer_sizes=[1000],
)


In [39]:

# Train on the training split for 10 epochs. The call's return value is shown
# as the cell output (about 0.50 in the recorded run).
# NOTE(review): presumably this is the average training loss -- confirm
# against the DeepChem docs.
model.fit(train_dataset, nb_epoch=10)


0.5014457066853841

In [40]:
# Evaluation metric: ROC AUC combined with np.mean. The recorded results are
# keyed 'mean-roc_auc_score', consistent with np.mean being used to average
# the per-task scores into a single number.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

In [41]:

# Score the fitted model on the training and test splits, in that order,
# using the same metric list and transformers for both.
train_scores, test_scores = [
    model.evaluate(split, [metric], transformers)
    for split in (train_dataset, test_dataset)
]


In [42]:
# Mean ROC AUC on the data the model was trained on.
print(train_scores)

{'mean-roc_auc_score': 0.9582977329400478}


In [43]:
# Mean ROC AUC on the held-out test split. In the recorded run this is well
# below the training score (about 0.68 vs 0.96), i.e. the model overfits.
print(test_scores)

{'mean-roc_auc_score': 0.681798455070547}
