## Install deepchem

### Colab

In [1]:
# Download DeepChem's Colab bootstrap script and save it as conda_installer.py.
# NOTE(review): the script is fetched from the `master` branch, so its content is not pinned.
!curl -Lo conda_installer.py https://raw.githubusercontent.com/deepchem/deepchem/master/scripts/colab_install.py
# Import the downloaded script and run its installer (per the recorded output it installs
# miniconda to /root/miniconda and then rdkit, openmm and pdbfixer).
import conda_installer
conda_installer.install()
# List the conda environments to confirm the installation succeeded.
!/root/miniconda/bin/conda info -e

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  3490  100  3490    0     0  13960      0 --:--:-- --:--:-- --:--:-- 13960


add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH
python version: 3.6.9
fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
done
installing miniconda to /root/miniconda
done
installing rdkit, openmm, pdbfixer
added omnia to channels
added conda-forge to channels
done
conda packages installation finished!


# conda environments:
#
base                  *  /root/miniconda



In [2]:
# Install the pinned DeepChem nightly build (--pre lets pip accept pre-release versions).
!pip install --pre deepchem-nightly==2.3.0
# Import the package and display its version as the cell output to verify the install.
import deepchem
deepchem.__version__

Collecting deepchem-nightly==2.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/06/86/e83fe3a6c6e93643269cbd9fc28861b147ba9286ee227ed775834cd01e59/deepchem-nightly-2.3.0.tar.gz (308kB)
[K     |█                               | 10kB 15.6MB/s eta 0:00:01[K     |██▏                             | 20kB 4.6MB/s eta 0:00:01[K     |███▏                            | 30kB 5.8MB/s eta 0:00:01[K     |████▎                           | 40kB 6.0MB/s eta 0:00:01[K     |█████▎                          | 51kB 5.1MB/s eta 0:00:01[K     |██████▍                         | 61kB 5.4MB/s eta 0:00:01[K     |███████▌                        | 71kB 5.9MB/s eta 0:00:01[K     |████████▌                       | 81kB 6.5MB/s eta 0:00:01[K     |█████████▋                      | 92kB 7.0MB/s eta 0:00:01[K     |██████████▋                     | 102kB 7.0MB/s eta 0:00:01[K     |███████████▊                    | 112kB 7.0MB/s eta 0:00:01[K     |████████████▊                   | 122kB

'2.3.0'

In [3]:
import tensorflow

# Name under which TensorFlow reports the GPU it can see.
device_name = tensorflow.test.gpu_device_name()

if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    # Abort the cell unless the first GPU is visible to TensorFlow.
    raise SystemError('GPU device not found')

# Cell output: the installed TensorFlow version.
tensorflow.__version__

Found GPU at: /device:GPU:0


'2.3.0'

### Locally

Follow the instructions here to set up your conda environment with DeepChem: https://deepchem.readthedocs.io/en/latest/installation.html#conda-installation

Additionally, install the requirements:

pip install -r requirements.txt

## Initialize

In [4]:
import itertools
import os
import random
from dataclasses import dataclass
from typing import Union, List, Tuple, NoReturn, Optional

import deepchem as dc
import numpy as np
import tensorflow as tf
from deepchem.models import KerasModel
from tqdm import tqdm

# Folder of this chapter's notebook on Google Drive (Drive is mounted at /content/drive).
ROOT_DIR = "/content/drive/My Drive/Colab Notebooks/deep learning for the life sciences/chapter_5_protein_binding"
# Data sub-folder of ROOT_DIR; passed to load_pdbbind as `data_dir` further down.
DATA_DIR = os.path.join(ROOT_DIR, "data")

In [5]:
%env DEEPCHEM_DATA_DIR=/content/drive/My Drive/Colab Notebooks/deep learning for the life sciences/chapter_5_protein_binding/data

env: DEEPCHEM_DATA_DIR=/content/drive/My Drive/Colab Notebooks/deep learning for the life sciences/chapter_5_protein_binding/data


In [6]:
# With no arguments %env lists every environment variable of the kernel; used here to
# check that DEEPCHEM_DATA_DIR is set.
# NOTE(review): the full environment ends up in the saved output -- clear it before sharing
# if the runtime holds anything sensitive.
%env

{'CLICOLOR': '1',
 'CLOUDSDK_CONFIG': '/content/.config',
 'CLOUDSDK_PYTHON': 'python3',
 'COLAB_GPU': '1',
 'CUDA_PKG_VERSION': '10-1=10.1.243-1',
 'CUDA_VERSION': '10.1.243',
 'CUDNN_VERSION': '7.6.5.32',
 'DATALAB_SETTINGS_OVERRIDES': '{"kernelManagerProxyPort":6000,"kernelManagerProxyHost":"172.28.0.3","jupyterArgs":["--ip=\\"172.28.0.2\\""]}',
 'DEBIAN_FRONTEND': 'noninteractive',
 'DEEPCHEM_DATA_DIR': '/content/drive/My Drive/Colab Notebooks/deep learning for the life sciences/chapter_5_protein_binding/data',
 'ENV': '/root/.bashrc',
 'GCE_METADATA_TIMEOUT': '0',
 'GCS_READ_CACHE_BLOCK_SIZE_MB': '16',
 'GIT_PAGER': 'cat',
 'GLIBCPP_FORCE_NEW': '1',
 'GLIBCXX_FORCE_NEW': '1',
 'HOME': '/root',
 'HOSTNAME': '3ecb877b3277',
 'JPY_PARENT_PID': '24',
 'KMP_DUPLICATE_LIB_OK': 'True',
 'KMP_INIT_AT_FORK': 'FALSE',
 'LANG': 'en_US.UTF-8',
 'LAST_FORCED_REBUILD': '20200910',
 'LD_LIBRARY_PATH': '/usr/lib64-nvidia',
 'LD_PRELOAD': '/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4',
 'LIBRARY_PAT

In [7]:
# Read the variable back through os.environ to confirm the Python process sees it
# (raises KeyError if it is not set). `os` is already imported in the initialisation cell.
import os
os.environ['DEEPCHEM_DATA_DIR']

'/content/drive/My Drive/Colab Notebooks/deep learning for the life sciences/chapter_5_protein_binding/data'

## Load the data

### Cached data

In [None]:
# Mount Google Drive at /content/drive so that the paths under ROOT_DIR / DATA_DIR resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pathlib import Path
import os

# Show the entries of the data folder first, then of the Drive mount point,
# as a quick check that the mount worked and the data folder is reachable.
for folder in (DATA_DIR, '/content/drive'):
    print(os.listdir(folder))

In [None]:
# Uncomment the two lines below to flush pending writes and unmount Google Drive.
# from google.colab import drive
# drive.flush_and_unmount()

In [None]:
# Load the PDBBind "core" subset with the grid featurizer and a seeded random split.
# Featurized data is read from / written to the "from-pdbbind" folder inside DATA_DIR.
pdbbind_tasks, pdbbind_datasets, transformers = dc.molnet.load_pdbbind(
    featurizer="grid",
    split="random",
    subset="core",
    data_dir=DATA_DIR,
    save_dir=os.path.join(DATA_DIR, "from-pdbbind"),
    split_seed=100,
    reload=True,
)

# The loader hands back the three splits together; give each its own name.
train_dataset, valid_dataset, test_dataset = pdbbind_datasets

In [None]:
@dataclass
class OutputModel:
    """Record of one training run: the model together with its settings and scores.

    Instances are built by the training loop and printed by visualize_model_output.
    """
    # DeepChem regressor returned by create_model (the training loop fits it before storing it).
    model: dc.models.MultitaskRegressor
    # Hidden-layer sizes that were passed to create_model as `layer_sizes`.
    layers: List[int]
    # Dropout setting that was passed to create_model as `dropouts`.
    dropout: Union[float, List[float]]
    # Seed the training loop applied before building the model.
    seed: int
    # Scores returned by evaluate_model (metric: pearson_r2_score) on the train / test split.
    train_score: Optional[float]
    test_score: Optional[float]


def create_model(
    layer_sizes: List[int], dropouts: Union[float, List[float]], learning_rate: float = 0.0003
) -> dc.models.MultitaskRegressor:
    """Build an unfitted MultitaskRegressor sized for the loaded PDBBind data.

    The number of tasks and input features are taken from the notebook globals
    `pdbbind_tasks` and `train_dataset`, so the data-loading cell must run first.

    Args:
        layer_sizes: width of each hidden layer, in order.
        dropouts: dropout setting forwarded to the model unchanged.
        learning_rate: learning rate forwarded to the model unchanged.

    Returns:
        The new model. Its `model_dir` lives under ROOT_DIR/model_dumps and is derived
        from the layer sizes, dropout and learning rate only, so two models built with
        the same arguments share one directory.
    """
    layers_tag = "-".join(str(size) for size in layer_sizes)
    dump_dir = (
        f"{ROOT_DIR}/model_dumps/pdbbind_nn__{layers_tag}"
        f"_dropout_{dropouts}_learningrate_{learning_rate}"
    )
    return dc.models.MultitaskRegressor(
        n_tasks=len(pdbbind_tasks),
        n_features=train_dataset.X.shape[1],
        layer_sizes=layer_sizes,
        dropouts=dropouts,
        learning_rate=learning_rate,
        model_dir=dump_dir,
    )


def evaluate_model(model: dc.models.MultitaskRegressor) -> Tuple[float, float]:
    """Score `model` on the training and test splits with the pearson_r2_score metric.

    Uses the notebook globals `train_dataset`, `test_dataset` and `transformers`.

    Returns:
        (train_score, test_score), evaluated in that order.
    """
    r2_metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

    def score_on(dataset):
        # evaluate() returns a mapping keyed by metric name; pick out the single metric used.
        return model.evaluate(dataset, [r2_metric], transformers)["pearson_r2_score"]

    return score_on(train_dataset), score_on(test_dataset)


def visualize_model_output(model: "OutputModel") -> None:
    """Print a report for one training run, followed by the model's layer summary.

    Args:
        model: the OutputModel record to report on. Its scores, layer sizes, dropout
            and seed are printed as stored.

    Returns:
        None. (The previous `NoReturn` annotation was incorrect: `NoReturn` marks
        functions that never return normally, whereas this one always does.)
    """
    print(
        f"""############################################################
Visualizing {model.model.__class__.__name__} model
  Training score: {model.train_score}
  Test score: {model.test_score}
  Hyperparameters: 
    Layers: {model.layers} 
    Dropout: {model.dropout}
    Seed: {model.seed}

Model summary:
""")
    # `model.model` is the DeepChem model; its own `.model` attribute (presumably the
    # wrapped Keras model) is the object that prints the layer summary.
    model.model.model.summary()
    print("############################################################")

In [None]:
# Hidden-layer configurations to try; the commented-out entries are currently disabled.
layers_list = [
    # [125, 62],
    # [250, 125],
    # [500, 250],
    # [1000, 500],
    # [2000, 1000],
    # [4000, 2000],
    [2000, 1000, 500, 250],
]

# Dropout values to try.
dropouts_list = [0.2, 0.5, 0.8]

# Random seeds to try for every (layers, dropout) pair.
seeds = [0, 10, 100, 1000]

# Full grid as (layers, dropout, seed) tuples; seeds vary fastest, layers slowest.
hyperparameters = [
    (layers, dropout, seed)
    for layers in layers_list
    for dropout in dropouts_list
    for seed in seeds
]

# Cell output: the three axes of the grid.
(layers_list, dropouts_list, seeds)

In [None]:
# Accumulator for the trained runs (one OutputModel per hyperparameter combination).
# Kept in its own cell so the training loop can be re-run without discarding finished runs.
nn_models = []
# The training and visualisation cells refer to this list as `models`; bind that name to
# the same list so they work on a fresh kernel instead of raising NameError.
models = nn_models

In [None]:
# Train one model per (layers, dropout, seed) combination and record it in `models`.
# Combinations whose index is already covered by `models` are skipped, so re-running this
# cell after an interruption resumes where it stopped instead of retraining everything.
for i, (layers, dropout, seed) in enumerate(tqdm(hyperparameters)):
    if i < len(models):
        continue
    # Seed every random number generator in play. Python's `random` alone does not affect
    # NumPy or TensorFlow, so without the two extra calls `seed` would not control
    # weight initialisation, dropout or batch shuffling.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    model = create_model(layer_sizes=layers, dropouts=dropout)
    model.fit(train_dataset, nb_epoch=50)
    train_score, test_score = evaluate_model(model=model)
    models.append(
        OutputModel(
            model=model,
            layers=layers,
            dropout=dropout,
            seed=seed,
            train_score=train_score,
            test_score=test_score,
        )
    )

In [None]:
# Print the report for every recorded run, in the order the runs were trained.
# NOTE(review): the loop variable rebinds the notebook-global `model` (the DeepChem model
# left over from the training loop) to an OutputModel record.
for model in models:
    visualize_model_output(model)