# Distributed hyper-parameter optimization example for the RPV GAN

This notebook shows how to distribute GAN training tasks, one per hyper-parameter set, on Cori using IPyParallel.

In [1]:
# Convenient fudge for python path: put the parent directory on the module
# search path (presumably so the local `atlasgan` package is importable).
import sys
# Guard keeps the cell idempotent: re-running it must not add duplicate entries
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
# System imports
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
import os

# External imports
import ipyparallel as ipp
import numpy as np
#import matplotlib.pyplot as plt

# Local imports
#from atlasgan.dataset import RPVImages
#from atlasgan.trainers import DCGANTrainer

In [3]:
%%bash
# List the current user's SLURM jobs; the JOBID column is used for the cluster ID
squeue -u "$USER"

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          13553035 interacti       sh sfarrell  R       0:52      2 nid000[11-12]


In [4]:
# The cluster ID is built from the SLURM job ID reported by squeue above
job_id = 13553035
cluster_id = 'cori_%i' % job_id

# Connect with the default profile and report which engines are available
c = ipp.Client(cluster_id=cluster_id, timeout=60)
print('Worker IDs:', c.ids)

Worker IDs: [0, 1]


## Configure the hyperparameter search tasks

In [5]:
# Data config
# Number of training samples (forwarded to the dataset as n_samples in build_and_train)
n_train = 1024
# Training images file, and the base directory under which each trial gets
# its own 'hp_<index>' output sub-directory
input_file = '/global/cscratch1/sd/sfarrell/atlas_gan/data_split/RPV10_1400_850_01_train.npz'
output_dir_base = '/global/cscratch1/sd/sfarrell/atlas_gan/AtlasDCGAN_notebook'

# Create the base output directory up front; exist_ok makes re-running the cell safe
os.makedirs(output_dir_base, exist_ok=True)

In [6]:
# Model config
# One value per trial is drawn for every hyper-parameter (random search).
n_hpo_trials = 2
# Dedicated, seeded generator so that Restart & Run All draws the same
# hyper-parameter sets; change hpo_seed to sample a different set of trials.
hpo_seed = 0
rng = np.random.RandomState(hpo_seed)
noise_dim = rng.choice([16, 32, 64, 128], size=n_hpo_trials)
n_filters = rng.choice([8, 16, 32, 64, 128], size=n_hpo_trials)
lr = rng.choice([1e-5, 1e-4, 2e-4, 1e-3, 5e-3], size=n_hpo_trials)
# Uniform over the half-open interval [0, 0.2)
flip_rate = rng.uniform(0, 0.2, size=n_hpo_trials)

In [7]:
# Training config
batch_size = 64
n_epochs = 8

## Run the hyperparameter search

In [8]:
def build_and_train(input_file, output_dir, n_train,
                    noise_dim, n_filters, lr, flip_rate,
                    batch_size, n_epochs):
    """Train a single DCGAN configuration.

    Every import is done inside the function body, so the modules are
    resolved in whichever process executes the call.

    Parameters
    ----------
    input_file : path handed to RPVImages.
    output_dir : created if missing, then handed to DCGANTrainer.
    n_train : forwarded to RPVImages as n_samples.
    noise_dim, n_filters : cast to int before being given to DCGANTrainer.
    lr, flip_rate : forwarded unchanged to DCGANTrainer.
    batch_size : forwarded to the DataLoader.
    n_epochs : forwarded to trainer.train.

    Returns nothing; results are whatever the trainer writes to output_dir.
    """
    # Same python path fudge as in the notebook kernel
    import sys
    sys.path.append('..')
    import os
    import logging
    from torch.utils.data import DataLoader
    from atlasgan.dataset import RPVImages
    from atlasgan.trainers import DCGANTrainer

    os.makedirs(output_dir, exist_ok=True)
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    # One scale value is shared by the dataset and the trainer arguments
    image_scale = 4e6
    images = RPVImages(input_file, n_samples=n_train, scale=image_scale)
    loader = DataLoader(images, batch_size=batch_size)
    #print('Loaded data with shape: %s' % str(images.data.size()))

    # Collect the trainer settings first, then build the trainer from them
    trainer_kwargs = dict(
        noise_dim=int(noise_dim), n_filters=int(n_filters),
        lr=lr, flip_rate=flip_rate,
        threshold=500./image_scale, image_norm=image_scale,
        output_dir=output_dir, cuda=False,
    )
    gan_trainer = DCGANTrainer(**trainer_kwargs)

    # Run the training
    gan_trainer.train(loader, n_epochs=n_epochs, n_save=8)

In [9]:
# Load-balanced view over the connected engines
lv = c.load_balanced_view()

# Submit one asynchronous build_and_train task per hyper-parameter set and
# keep the returned result handles so progress can be polled later.
results = []
for ihp in range(n_hpo_trials):
    # %g for lr: with %.4f the smallest candidate (1e-5) is printed as 0.0000
    print('Hyperparameter trial %i noise %i filters %i flip %.3f lr %g' %
          (ihp, noise_dim[ihp], n_filters[ihp], flip_rate[ihp], lr[ihp]))
    # Each trial writes into its own sub-directory of the base output dir
    output_dir = os.path.join(output_dir_base, 'hp_%i' % ihp)
    result = lv.apply(build_and_train,
                      input_file=input_file, output_dir=output_dir, n_train=n_train,
                      noise_dim=noise_dim[ihp], n_filters=n_filters[ihp], lr=lr[ihp],
                      flip_rate=flip_rate[ihp], batch_size=batch_size, n_epochs=n_epochs)
    results.append(result)

Hyperparameter trial 0 noise 32 filters 128 flip 0.193 lr 0.0010
Hyperparameter trial 1 noise 64 filters 32 flip 0.013 lr 0.0050


In [21]:
# Count the tasks whose result handle reports ready()
n_done = sum(task.ready() for task in results)
print('Tasks completed: %i / %i' % (n_done, len(results)))

Tasks completed: 2 / 2


In [22]:
# Show the result handles; the repr includes each task's current state
results

[<AsyncResult: build_and_train:finished>,
 <AsyncResult: build_and_train:finished>]

In [23]:
# Look at the first trial: print the stdout captured from the task
ar = results[0]
print(ar.stdout)

Loaded data with shape: torch.Size([1024, 1, 64, 64])



In [24]:
# Task metadata: timestamps, engine id, status and the captured stdout/stderr
ar.metadata

{'msg_id': '165b86a4-bb886678a23930a529f8df20',
 'submitted': datetime.datetime(2018, 7, 8, 15, 48, 31, 589789, tzinfo=tzutc()),
 'started': datetime.datetime(2018, 7, 8, 15, 48, 31, 595909, tzinfo=tzutc()),
 'completed': datetime.datetime(2018, 7, 8, 16, 10, 25, 869777, tzinfo=tzutc()),
 'received': datetime.datetime(2018, 7, 8, 16, 10, 25, 873703, tzinfo=tzutc()),
 'engine_uuid': '33154006-e458055573ef4fa6f24b1898',
 'engine_id': 1,
 'follow': [],
 'after': [],
 'status': 'ok',
 'execute_input': None,
 'execute_result': None,
 'error': None,
 'stdout': 'Loaded data with shape: torch.Size([1024, 1, 64, 64])\n',
 'stderr': '',
 'outputs': [],
 'data': {}}

In [25]:
# Re-display the output captured from the task in this notebook
ar.display_outputs()

Loaded data with shape: torch.Size([1024, 1, 64, 64])


## Discussion

This notebook example doesn't do much yet. Each hyper-parameter task writes model checkpoints at every epoch, so all the information needed for later analysis is saved.

Coming soon I'll add some validation directly to the task so we can get an answer right away about the quality of the models.