## Install deepchem

### Colab

In [None]:
# Download DeepChem's Colab install script and save it locally as conda_installer.py.
!curl -Lo conda_installer.py https://raw.githubusercontent.com/deepchem/deepchem/master/scripts/colab_install.py
import conda_installer
# Run the downloaded script's install() entry point.
conda_installer.install()
# List the conda environments of the Miniconda located under /root/miniconda.
!/root/miniconda/bin/conda info -e

In [None]:
# --pre allows pip to select a pre-release build of deepchem.
!pip install --pre deepchem
import deepchem as dc
# Show the installed DeepChem version as the cell output.
dc.__version__

In [None]:
import tensorflow as tf

# Abort early when the runtime has no GPU attached: everything below is
# meant to run on the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

# Show the TensorFlow version as the cell output.
tf.__version__

## Initialize

In [None]:
import itertools
import pandas as pd
from dataclasses import dataclass
from typing import Union, List, Tuple, NoReturn, Optional
import random
import os

import deepchem as dc
from tqdm import tqdm

# Project folder on Google Drive (Drive is mounted at /content/drive in a later cell).
ROOT_DIR = "/content/drive/My Drive/Colab Notebooks/deep learning for the life sciences/hiv"
# Data subfolder of the project folder; used for the CSV and the DeepChem cache.
DATA_DIR = os.path.join(ROOT_DIR, "data")

In [None]:
# Export the data folder through the DEEPCHEM_DATA_DIR environment variable.
# The value is taken from DATA_DIR instead of repeating the literal path, so
# the two can never drift apart when the project folder is moved.
os.environ["DEEPCHEM_DATA_DIR"] = DATA_DIR

In [None]:
import os
# Echo the variable to confirm it is set; this raises KeyError when it is missing.
os.environ['DEEPCHEM_DATA_DIR']

## Load the data

### Cached data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive, the prefix that ROOT_DIR and DATA_DIR are built on.
drive.mount('/content/drive')

In [None]:
from pathlib import Path
import os

# Sanity-check the Drive mount: print the contents of the data folder and
# then of the top-level notebook folder, in that order.
for folder in (
    DATA_DIR,
    '/content/drive/My Drive/Colab Notebooks/deep learning for the life sciences',
):
    print(os.listdir(folder))

In [None]:
# for unmounting
# from google.colab import drive
# drive.flush_and_unmount()

In [None]:
# Read HIV.csv from the data folder into a DataFrame for manual inspection.
data = pd.read_csv(Path(DATA_DIR) / "HIV.csv")

In [None]:
# (number of rows, number of columns) of the CSV table.
data.shape

In [None]:
# Boolean mask: True for rows whose "HIV_active" column equals 1.
data["HIV_active"] == 1

In [None]:
# Preview the first rows of the table.
data.head()

In [None]:
# Only the rows whose "HIV_active" column equals 1.
data[data["HIV_active"] == 1]

In [None]:
# Load the HIV dataset through DeepChem's MoleculeNet loader with ECFP
# features and a random split (split_seed=100). Featurized data lives in
# the "from-hiv" subfolder of DATA_DIR.
# NOTE(review): reload=True presumably reuses a previously featurized copy
# from save_dir when one exists — confirm against the loader docs.
hiv_tasks, hiv_datasets, transformers = dc.molnet.load_hiv(
    featurizer="ECFP",
    split="random",
    split_seed=100,
    data_dir=DATA_DIR,
    save_dir=os.path.join(DATA_DIR, "from-hiv"),
    reload=True,
)
# The loader returns the three splits in train / valid / test order.
(train_dataset, valid_dataset, test_dataset) = hiv_datasets

## Preprocessing

In [None]:
# Convert the training split to a DataFrame so it can be inspected interactively.
training_df = train_dataset.to_dataframe()
training_df.head()

In [None]:
# Rows of the training frame whose "y" column equals 1.
training_df[training_df["y"] == 1]

## Training

### Hyperparameters

In [None]:
# Hidden-layer configurations to try; each inner list is later passed as
# `layer_sizes`. The commented entries are alternatives currently switched off.
layers_list = [
    [125, 62],
    # [250, 125],
    # [500, 250],
    # [1000, 500],
    # [2000, 1000],
    # [4000, 2000],
    # [2000, 1000, 500, 250],
]

# Dropout values to try.
dropouts_list = [0.8]

# Seed values recorded alongside each trained model.
seeds = [0]

# Every (layers, dropout, seed) combination of the three lists above.
hyperparameters = list(itertools.product(layers_list, dropouts_list, seeds))

# Show the three search axes as the cell output.
(layers_list, dropouts_list, seeds)

In [None]:
# Accumulator for the OutputModel records produced by the training loop.
nn_models = []

In [None]:
@dataclass
class OutputModel:
    """A model bundled with the hyperparameters it was built with and optional scores."""

    # The DeepChem (or scikit-learn wrapper) model instance.
    model: Union[dc.models.Model, dc.models.sklearn_models.SklearnModel]
    # Hidden-layer sizes used to build the model; None is allowed by the type.
    layers: Optional[List[int]]
    # Dropout value (single float or one per layer); None is allowed by the type.
    dropout: Optional[Union[float, List[float]]]
    # Seed taken from the hyperparameter grid.
    seed: int
    # Scores default to None until something assigns them.
    train_score: Optional[float] = None
    test_score: Optional[float] = None

In [None]:
def create_nn_model(
    layer_sizes: List[int], dropouts: Union[float, List[float]], learning_rate: float = 0.0003
) -> dc.models.Model:
    """Construct a DeepChem ``MultitaskClassifier`` for the HIV tasks.

    The number of tasks comes from the notebook-level ``hiv_tasks`` and the
    input width from the second dimension of ``train_dataset.X``. The model
    is only constructed here; fitting is left to the caller.

    Args:
        layer_sizes: Sizes of the hidden layers, forwarded as ``layer_sizes``.
        dropouts: Dropout value(s), forwarded as ``dropouts``.
        learning_rate: Learning rate, forwarded as ``learning_rate``.

    Returns:
        The newly constructed classifier. Its ``model_dir`` sits under
        ``ROOT_DIR/model_dumps`` and encodes the layer sizes, dropout and
        learning rate in the folder name.
    """
    size_tag = "-".join(str(size) for size in layer_sizes)
    dump_dir = f"{ROOT_DIR}/model_dumps/hiv_nn__{size_tag}_dropout_{dropouts}_learningrate_{learning_rate}"
    return dc.models.MultitaskClassifier(
        n_tasks=len(hiv_tasks),
        n_features=train_dataset.X.shape[1],
        layer_sizes=layer_sizes,
        dropouts=dropouts,
        learning_rate=learning_rate,
        model_dir=dump_dir,
    )

In [None]:
# Train one network per hyperparameter combination (50 epochs each) and
# append the fitted model, together with its settings, to nn_models.
# Iterating the tuples directly replaces the index-based range(len(...)) loop;
# tqdm still knows the total because `hyperparameters` is a list.
# NOTE(review): `seed` is only stored on the OutputModel record; nothing in
# this loop applies it to a random number generator — confirm whether
# seeding was intended.
for layers, dropout, seed in tqdm(hyperparameters):
    model_nn = create_nn_model(layer_sizes=layers, dropouts=dropout)
    model_nn.fit(train_dataset, nb_epoch=50)
    nn_models.append(
        OutputModel(
            model=model_nn,
            layers=layers,
            dropout=dropout,
            seed=seed,
        )
    )

## Evaluating

In [None]:
# Show the transformers returned by the dataset loader.
transformers

In [None]:
# Number of models collected by the training loop.
len(nn_models)

In [None]:
# Summarise the first trained model. The outer `.model` is the OutputModel field;
# the inner `.model` is presumably the wrapped Keras model — TODO confirm.
nn_models[0].model.model.summary()