## Set up an IPyParallel cluster

In [1]:
import ipcluster_magics

In [2]:
# Allocation settings handed to the %ipcluster magic (-J and -N flags).
job_name, nodes = "isc_ihpc_mnist", 4

# Software environment handed to the %ipcluster magic (-m and -e flags).
module = "python/3.6-anaconda-4.4"
conda_env = "/global/cscratch1/sd/sfarrell/conda/isc-ihpc"

In [3]:
# Start the cluster through the %ipcluster magic from ipcluster_magics, using the
# settings defined in the previous cell plus a hard-coded 1:00:00 value for -t.
# NOTE(review): presumably -m = module, -e = conda env, -N = node count,
# -J = job name, -t = wall-time limit -- confirm against ipcluster_magics.
%ipcluster -m $module -e $conda_env -N $nodes -J $job_name -t 1:00:00

salloc: Pending job allocation 13251549
salloc: job 13251549 queued and waiting for resources
salloc: job 13251549 has been allocated resources
salloc: Granted job allocation 13251549
salloc: Waiting for resource configuration
salloc: Nodes nid00[036,108-110] are ready for job


## Connect a client to the running IPP cluster

In [4]:
# Connect to the IPP controller, then wait for the engines to register.
import time
import ipyparallel as ipp

# Phase 1: the controller may still be starting, so retry the connection a
# bounded number of times. `c` stays None if every attempt fails.
c = None
wait_time = 5
retries = 3
while retries > 0:
    print("checking ipcontroller...")
    try:
        c = ipp.Client()
        print("ipcontroller is running")
        break
    except Exception as e:
        print(e.args)
        print("ipcontroller is not running yet, waiting {} seconds before retry...".format(wait_time))
        time.sleep(wait_time)
        retries -= 1

if c is None:
    # Make the failure visible instead of silently skipping the rest of the cell.
    print("could not connect to ipcontroller; no views were created")

# Phase 2: wait until at least `nodes` engines have registered.
# Every wait consumes a retry, including the partial-registration case, so the
# loop always terminates even if one engine never shows up.
wait_time = 10
retries = 3
while c is not None and retries > 0:
    n_registered = len(c.ids)
    if n_registered >= nodes:
        break
    if n_registered == 0:
        print("engines are not registered yet with controller, waiting {} seconds before retry...".format(wait_time))
    else:
        print("not all engines have registered, waiting {} seconds...".format(wait_time))
    time.sleep(wait_time)
    retries -= 1

if c is not None:
    n_registered = len(c.ids)
    if n_registered < nodes:
        # Best effort: continue with whatever engines are available, but say so.
        print("only {} of {} engines registered; continuing with the engines available".format(n_registered, nodes))
    lv = c.load_balanced_view()
    dv = c.direct_view()
    print(c.ids)

checking ipcontroller...
Waiting for connection file: /global/u2/s/sfarrell/.ipython/profile_default/security/ipcontroller-client.json
("Connection file '/global/u2/s/sfarrell/.ipython/profile_default/security/ipcontroller-client.json' not found.\nYou have attempted to connect to an IPython Cluster but no Controller could be found.\nPlease double-check your configuration and ensure that a cluster is running.",)
ipcontroller is not running yet, waiting 5 seconds before retry...
checking ipcontroller...
Waiting for connection file: /global/u2/s/sfarrell/.ipython/profile_default/security/ipcontroller-client.json
("Connection file '/global/u2/s/sfarrell/.ipython/profile_default/security/ipcontroller-client.json' not found.\nYou have attempted to connect to an IPython Cluster but no Controller could be found.\nPlease double-check your configuration and ensure that a cluster is running.",)
ipcontroller is not running yet, waiting 5 seconds before retry...
2018-06-20 17:26:30.734 [scheduler] 

In [7]:
# Display the ids of the engines currently known to the client.
c.ids

[0, 1, 2, 3]

## Interactively run multiple parameter sets

In [8]:
import mnist

# Fetch the train/test arrays from the mnist helper module.
x_train, y_train, x_test, y_test = mnist.load_data()

# Report the shape of each array, one line per array.
print('\n'.join(
    '{} shape: {!s}'.format(split_name, split.shape)
    for split_name, split in (
        ('x_train', x_train),
        ('x_test', x_test),
        ('y_train', y_train),
        ('y_test', y_test),
    )
))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


x_train shape: (60000, 28, 28, 1)
x_test shape: (10000, 28, 28, 1)
y_train shape: (60000, 10)
y_test shape: (10000, 10)


In [9]:
# Training config: settings that stay the same for every run.
import os

# '$USER' is kept literal here; only the commented-out checkpoint entry below
# would expand it (via os.path.expandvars).
checkpoint_dir = '/global/cscratch1/sd/$USER/cori-interactive-dl/mnist-hpo'

# Number of leading training examples passed to each run.
n_samples = 1000

fixed_params = dict(
    verbose=0,
    batch_size=128,
    nthreads=8,
    n_epochs=32,
    valid_frac=0.17,
    # checkpoint_file=os.path.join(os.path.expandvars(checkpoint_dir), 'model_single.h5'),
    x_train=x_train[:n_samples],
    y_train=y_train[:n_samples],
)

In [10]:
import numpy as np

n_hpo_trials = 8

# Seed for the hyperparameter draws; change it to sample a different set of trials.
hpo_seed = 42

# Use a dedicated, seeded generator so that Restart & Run All reproduces the same
# trials, without touching NumPy's global random state.
hpo_rng = np.random.RandomState(hpo_seed)

# One randomly drawn value per trial for each hyperparameter.
grid_h1 = hpo_rng.choice([4, 8, 16, 32, 64], size=n_hpo_trials)
grid_h2 = hpo_rng.choice([4, 8, 16, 32, 64], size=n_hpo_trials)
grid_h3 = hpo_rng.choice([8, 16, 32, 64, 128], size=n_hpo_trials)
grid_dropout = hpo_rng.rand(n_hpo_trials)  # uniform samples in [0, 1)
grid_optimizer = hpo_rng.choice(['Adadelta', 'Adam', 'Nadam'], size=n_hpo_trials)

In [11]:
import functools as ft
from mlextras import build_and_train
from hpo_widgets import ModelPlot, ParamSpanWidget

# Per-trial hyperparameter values, keyed by parameter name.
hpo_params = {
    "h1": grid_h1,
    "h2": grid_h2,
    "h3": grid_h3,
    "dropout": grid_dropout,
    "optimizer": grid_optimizer,
}

# Training callable with the constant settings already bound.
run_training = ft.partial(build_and_train, **fixed_params)

# Plot factory with the series names, axis labels and x-range already bound;
# the x-axis spans the configured number of epochs.
plot_metrics = ft.partial(
    ModelPlot,
    y=['loss', 'acc', 'val_loss', 'val_acc'],
    xlim=[0, fixed_params["n_epochs"]],
    xlabel='epochs',
    ylabel='training metrics',
)

# Build the widget from the training callable, the plot factory and the
# hyperparameter values, then submit the computations.
psw = ParamSpanWidget(run_training, plot_metrics, hpo_params)
psw.submit_computations()

# Last expression of the cell: renders the widget.
psw

ParamSpanWidget(children=(Output(layout=Layout(border='1px solid', height='600px', overflow_x='scroll', overfl…

In [13]:
# Show the widget's `debug` attribute (rendered below as an Output widget).
psw.debug

Output(layout=Layout(border='1px solid', height='500px', overflow_x='scroll', overflow_y='scroll'), outputs=({…

In [None]:
from IPython.display import display

# Render the `debug` attribute of every model plot, in order.
for model_plot in psw.model_plots:
    display(model_plot.debug)

## Look at additional model details

In [14]:
import pprint

# Pretty-print the metadata of each model run, in submission order.
for model_run in psw.model_runs:
    pprint.pprint(model_run.metadata)

{'after': [],
 'completed': datetime.datetime(2018, 6, 21, 0, 31, 50, 180434, tzinfo=tzutc()),
 'data': {'epoch': 31, 'logs': {}, 'status': 'Ended Training'},
 'engine_id': 1,
 'engine_uuid': '11669646-4876e31016d03eea6b1765cb',
 'error': None,
 'execute_input': None,
 'execute_result': None,
 'follow': [],
 'msg_id': '6fe20358-268d85d60ef1a91997ccf6b6',
 'outputs': [],
 'received': datetime.datetime(2018, 6, 21, 0, 31, 50, 670085, tzinfo=tzutc()),
 'started': datetime.datetime(2018, 6, 21, 0, 30, 21, 500093, tzinfo=tzutc()),
 'status': 'ok',
 'stderr': '/global/cscratch1/sd/sfarrell/conda/isc-ihpc/lib/python3.6/site-packages/h5py/__init__.py:36: '
           'from `float` to `np.floating` is deprecated. In future, it will be '
           'treated as `np.float64 == np.dtype(float).type`.\n'
           '  from ._conv import register_converters as _register_converters\n'
           'Using TensorFlow backend.\n',
 'stdout': '',
 'submitted': datetime.datetime(2018, 6, 21, 0, 30, 21, 27091

## Release job resources

#### Grab the job id for connecting to this cluster

In [None]:
%%bash -s "{job_name}" --out job_id
# Capture the id(s) of this user's job(s) named $1 into the Python variable job_id.
# -h drops the header row and -o "%i" prints only the job id, so no header
# skipping is needed. awk joins multiple matches with single spaces (and emits no
# trailing newline) so that several ids are never fused into one bogus id.
squeue -h -u "$USER" -n "$1" -o "%i" | awk '{printf "%s%s", (NR > 1 ? " " : ""), $1}'

#### Cancel the current job

In [None]:
%%bash -s "{job_id}"
# Cancel the job(s) whose id(s) arrive in $1.
# An empty id would call scancel with no arguments and fail with an unhelpful
# error, so report that case and do nothing instead.
if [ -z "$1" ]; then
    echo "no job id given; nothing to cancel" >&2
else
    # Intentionally unquoted: whitespace-separated ids become separate arguments.
    scancel $1
fi