# Tune Tutorial

In [1]:
from helper import load_data
import numpy as np

Using TensorFlow backend.


We want to start off by creating a model:

In [2]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

def make_model(args):
    """Build and compile the convolutional classifier used in this tutorial.

    The network takes inputs of shape (28, 28, 1) and ends in a 10-way
    softmax layer. It is compiled with categorical cross-entropy loss, an SGD
    optimizer and the 'accuracy' metric.

    Args:
        args: Object exposing the hyperparameters as attributes:
            ``kernel1``, ``kernel2`` (side length of the square convolution
            kernels), ``poolsize`` (side length of the square pooling
            window), ``dropout1``, ``dropout2`` (dropout rates), ``hidden``
            (units in the hidden dense layer), ``lr`` and ``momentum``
            (SGD settings).

    Returns:
        The compiled ``Sequential`` model.
    """
    num_classes = 10

    # All kernels and pooling windows are square, so one number sizes each.
    first_kernel = (args.kernel1, args.kernel1)
    second_kernel = (args.kernel2, args.kernel2)
    pool_window = (args.poolsize, args.poolsize)

    net = Sequential()
    stack = [
        Conv2D(32, kernel_size=first_kernel,
               activation='relu', input_shape=(28, 28, 1)),
        Conv2D(64, second_kernel, activation='relu'),
        MaxPooling2D(pool_size=pool_window),
        Dropout(args.dropout1),
        Flatten(),
        Dense(args.hidden, activation='relu'),
        Dropout(args.dropout2),
        Dense(num_classes, activation='softmax'),
    ]
    for layer in stack:
        net.add(layer)

    sgd = keras.optimizers.SGD(lr=args.lr, momentum=args.momentum)
    net.compile(loss=keras.losses.categorical_crossentropy,
                optimizer=sgd,
                metrics=['accuracy'])
    return net

def train_mnist(args):
    """Train a freshly built model once on the data from ``load_data``.

    Args:
        args: Hyperparameter object forwarded unchanged to ``make_model``.

    Returns:
        None. Progress is printed by Keras (``verbose=1``); the test split is
        passed as validation data.
    """
    dataset = load_data()
    x_train, x_test, y_train, y_test = dataset

    net = make_model(args)
    net.fit(
        x_train,
        y_train,
        verbose=1,
        validation_data=(x_test, y_test),
        callbacks=[],  # no callbacks in the plain, non-Tune run
    )

*Then*, we want to train this model, trying out the default hyperparameters first.

Now, let's try running a simple search to find the best hyperparameters. We start by declaring each hyperparameter and its default value with `argparse`:

In [10]:
import argparse
# Hyperparameters and their defaults, declared argparse-style so the same
# names can later be overridden from a Tune config dict.
parser = argparse.ArgumentParser(description='Keras MNIST Example')
# NOTE(review): `--steps` was a copy of the `--lr` definition (float, 0.01,
# metavar 'LR', learning-rate help text). Type and default are kept as they
# were; nothing in the model or training code shown here reads `args.steps` —
# confirm the intended meaning, type and default.
parser.add_argument('--steps', type=float, default=0.01, metavar='STEPS',
                    help='steps (default: 0.01)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--kernel1', type=int, default=3,
                    help='Size of first kernel (default: 3)')
parser.add_argument('--kernel2', type=int, default=3,
                    help='Size of second kernel (default: 3)')
parser.add_argument('--poolsize', type=int, default=2,
                    help='Size of Pooling (default: 2)')
parser.add_argument('--dropout1', type=float, default=0.25,
                    help='Dropout rate after the pooling layer (default: 0.25)')
parser.add_argument('--hidden', type=int, default=128,
                    help='Size of Hidden Layer (default: 128)')
parser.add_argument('--dropout2', type=float, default=0.5,
                    help='Dropout rate after the hidden layer (default: 0.5)')

# parse_known_args() tolerates unrecognized entries in sys.argv (presumably
# the notebook kernel's own launch arguments — confirm); only the parsed
# namespace is kept.
args = parser.parse_known_args()[0]

In [21]:
train_mnist(args)

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
Train on 60000 samples, validate on 10000 samples
Epoch 1/12

KeyboardInterrupt: 

Now, let's use Tune to multiplex our training runs across the CPUs of this single machine and find the best hyperparameters.

In [4]:
import ray
from ray import tune
from helper import TuneCallback

In [31]:
def train_mnist_tune(args, config, reporter):
    """Tune-compatible variant of ``train_mnist``.

    Differences from the plain training function are marked "added for Tune".

    Args:
        args: Baseline hyperparameter namespace. It is NOT modified; the
            overrides are applied to a shallow copy so values from one trial
            cannot leak into later calls that reuse the same ``args`` object.
        config: Mapping of hyperparameter name -> value for this trial; each
            entry overrides the attribute of the same name from ``args``.
        reporter: Tune reporter object, handed to ``TuneCallback``.

    Returns:
        None.
    """
    import copy  # local import keeps this cell runnable on its own

    # added for Tune: overlay the trial's sampled values on the defaults
    trial_args = copy.copy(args)
    vars(trial_args).update(config)

    # added for Tune: limit_threads=4 is forwarded to the data-loading helper
    x_train, x_test, y_train, y_test = load_data(limit_threads=4)
    model = make_model(trial_args)
    model.fit(x_train, y_train,
              verbose=1,
              validation_data=(x_test, y_test),
              callbacks=[TuneCallback(reporter)])  # added for Tune

In [36]:
# ignore_reinit_error=True lets this cell be re-run after Ray is already up.
ray.init(ignore_reinit_error=True)

# Expose the training function to Tune under the name "train_mnist"; the
# lambda binds the notebook-level `args` as the baseline hyperparameters.
tune.register_trainable(
    "train_mnist",
    lambda config, reporter: train_mnist_tune(args, config, reporter),
)

# Each config entry is a callable that draws a fresh random value with NumPy.
configuration = tune.Experiment(
    "experiment_name",
    stop=dict(mean_accuracy=0.99),
    run="train_mnist",
    config=dict(
        lr=lambda spec: np.random.uniform(0.001, 0.1),
        momentum=lambda spec: np.random.uniform(0.1, 0.9),
        hidden=lambda spec: np.random.randint(32, 512),
        dropout1=lambda spec: np.random.uniform(0.2, 0.8),
    ),
)
tune.run_experiments(configuration)

Calling ray.init() again after it has already been called.
== Status ==
Using FIFO scheduling algorithm.


Created LogSyncer for /Users/rliaw/ray_results/experiment_name/train_mnist_0_dropout1=0.68217,hidden=297,lr=0.073576,momentum=0.69057_2018-09-27_20-25-09swpljiao -> 
== Status ==
Using FIFO scheduling algorithm.
Resources requested: 1/8 CPUs, 0/0 GPUs
Result logdir: /Users/rliaw/ray_results/experiment_name
RUNNING trials:
 - train_mnist_0_dropout1=0.68217,hidden=297,lr=0.073576,momentum=0.69057:	RUNNING

Result for train_mnist_0_dropout1=0.68217,hidden=297,lr=0.073576,momentum=0.69057:
  date: 2018-09-27_20-25-14
  done: false
  experiment_id: 971d502824fc41819c007254d48b6206
  hostname: C02TX1VXHTDD
  iterations_since_restore: 1
  mean_accuracy: 0.4375
  node_ip: 127.0.0.1
  pid: 33371
  time_since_restore: 2.0018250942230225
  time_this_iter_s: 2.0018250942230225
  time_total_s: 2.0018250942230225
  timestamp: 1538105114
  timesteps_since_restore: 0
  training_iteration: 1
  
==

[train_mnist_0_dropout1=0.68217,hidden=297,lr=0.073576,momentum=0.69057]

In [15]:
class Model(tune.Trainable):
    """Class-based Tune trainable wrapping the Keras model.

    Compared with the function-based trainable, this form adds checkpointing
    through ``_save`` / ``_restore``.
    """

    def _setup(self):
        """Build the model from the defaults overlaid with ``self.config``."""
        import copy  # local import keeps this cell runnable on its own

        # Apply the trial's values to a copy so the notebook-level `args`
        # keeps its defaults.
        trial_args = copy.copy(args)
        vars(trial_args).update(self.config)
        self.model = make_model(trial_args)
        self.data = load_data(limit_threads=4)

    def _train(self):
        """Run one ``fit`` call and report its accuracy.

        Returns:
            dict: ``{"mean_accuracy": ...}`` holding the first entry of the
            ``"acc"`` history returned by ``fit``.
        """
        x_train, x_test, y_train, y_test = self.data
        result = self.model.fit(x_train, y_train,
                                verbose=1,
                                validation_data=(x_test, y_test))
        return {"mean_accuracy": result.history["acc"][0]}

    def _save(self, checkpoint_dir):
        """Write the model weights into ``checkpoint_dir``.

        Returns:
            str: Path of the written weights file; this is the value that is
            later handed to ``_restore``.
        """
        import os  # `os` is not imported anywhere else in the notebook

        checkpoint_path = os.path.join(checkpoint_dir, "weights.h5")
        self.model.save_weights(checkpoint_path)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        """Load model weights from the path produced by ``_save``."""
        self.model.load_weights(checkpoint_path)

In [14]:
# ignore_reinit_error=True lets this cell be re-run after Ray is already up.
ray.init(ignore_reinit_error=True)

# Same random search as the function-based experiment, but running the
# `Model` trainable class and requesting a checkpoint when a trial ends.
configuration = tune.Experiment(
    "experiment_name",
    run=Model,
    stop=dict(mean_accuracy=0.99),
    config=dict(
        lr=lambda spec: np.random.uniform(0.001, 0.1),
        momentum=lambda spec: np.random.uniform(0.1, 0.9),
        hidden=lambda spec: np.random.randint(32, 512),
        dropout1=lambda spec: np.random.uniform(0.2, 0.8),
    ),
    checkpoint_at_end=True,
)
tune.run_experiments(configuration)

Calling ray.init() again after it has already been called.
== Status ==
Using FIFO scheduling algorithm.


Created LogSyncer for /Users/rliaw/ray_results/experiment_name/Model_0_dropout1=0.59719,hidden=288,lr=0.044971,momentum=0.33499_2018-09-27_23-00-1552ds5byk -> 
== Status ==
Using FIFO scheduling algorithm.
Resources requested: 1/8 CPUs, 0/0 GPUs
Result logdir: /Users/rliaw/ray_results/experiment_name
RUNNING trials:
 - Model_0_dropout1=0.59719,hidden=288,lr=0.044971,momentum=0.33499:	RUNNING

Remote function [31mtrain[39m failed with:

Traceback (most recent call last):
  File "/Users/rliaw/Research/riselab/ray/python/ray/worker.py", line 945, in _process_task
    *arguments)
  File "/Users/rliaw/Research/riselab/ray/python/ray/actor.py", line 261, in actor_method_executor
    method_returns = method(actor, *args)
  File "/Users/rliaw/Research/riselab/ray/python/ray/tune/trainable.py", line 143, in train
    result = self._train()
  File "<ipython-input-13-d3e5934c35d8>", line 1

TuneError: ('Trials did not complete', [Model_0_dropout1=0.59719,hidden=288,lr=0.044971,momentum=0.33499])