# Keras + Horovod + ipyparallel MNIST example

In this notebook we will use ipyparallel to deploy a Keras + Horovod distributed training example.

In [1]:
# System imports
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

# External imports
import ipyparallel as ipp

## Connect to ipyparallel cluster

In [2]:
# Attach to the cluster through the default ipyparallel profile (for now)
c = ipp.Client()
engine_ids = c.ids
print('Worker IDs:', engine_ids)

## Initialize environment on the workers

In [4]:
%%px

from __future__ import print_function
from __future__ import division

import socket
import math

import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import tensorflow as tf

# Horovod for MPI synchronization routines
import horovod.keras as hvd

In [5]:
%%px

# Start Horovod on this worker, then report where it landed
hvd.init()
rank_info = (hvd.rank(), hvd.local_rank(), socket.gethostname())
print('MPI rank %i, local rank %i, host %s' % rank_info)

[stdout:0] MPI rank 3, local rank 0, host nid00027
[stdout:1] MPI rank 1, local rank 0, host nid00019
[stdout:2] MPI rank 2, local rank 0, host nid00020
[stdout:3] MPI rank 0, local rank 0, host nid00018


In [6]:
%%px

# Dataset geometry: number of target classes and the (square) image size
n_classes = 10
img_rows = img_cols = 28

# Training settings: per-worker batch size and number of epochs
batch_size, n_epochs = 128, 8

## Load the data on each worker

In [7]:
%%px

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Place the single channel axis where the Keras backend expects it
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)

x_train = x_train.reshape((x_train.shape[0],) + input_shape)
x_test = x_test.reshape((x_test.shape[0],) + input_shape)

# Convert to float32 and divide by 255
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

n_train, n_test = x_train.shape[0], x_test.shape[0]
print('x_train shape:', x_train.shape)
print(n_train, 'train samples')
print(n_test, 'test samples')

# Turn the integer class labels into binary class matrices
y_train = keras.utils.to_categorical(y_train, n_classes)
y_test = keras.utils.to_categorical(y_test, n_classes)

[stdout:0] 
x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
[stdout:1] 
x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
[stdout:2] 
x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
[stdout:3] 
x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


## Define the model

In [8]:
%%px

# Two convolutions, pooling and dropout, followed by a dense softmax head
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(n_classes, activation='softmax'),
])

# Scale the Adadelta learning rate by the number of workers, then wrap the
# optimizer in Horovod's DistributedOptimizer.
n_workers = hvd.size()
opt = hvd.DistributedOptimizer(keras.optimizers.Adadelta(1.0 * n_workers))

model.compile(optimizer=opt,
              loss=keras.losses.categorical_crossentropy,
              metrics=['accuracy'])

# Print the architecture once (rank 0 only) rather than from every worker
if hvd.rank() == 0:
    model.summary()

[stdout:3] 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_5 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 24, 24, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 12, 12, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 12, 12, 64)        0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 9216)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               1179776   
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0        

## Distributed training

Training with Horovod + MPI allows for synchronous distributed batch updates.

We need to register the model synchronization callback. If checkpoints are written, they should be restricted to a single worker; the checkpoint callback is left commented out in this example.

In [9]:
%%px

# Horovod: rank 0 broadcasts its initial variable states to all other
# processes, so every worker starts from the same weights whether training
# begins from random initialization or from a restored checkpoint.
callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]

# Horovod: checkpoints should be saved on worker 0 only, so that other workers
# cannot corrupt them (currently disabled).
#if hvd.rank() == 0:
#    callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

fit_args = dict(batch_size=batch_size,
                epochs=n_epochs,
                callbacks=callbacks,
                verbose=2,
                validation_data=(x_test, y_test))
history = model.fit(x_train, y_train, **fit_args)

[stdout:0] 
Train on 60000 samples, validate on 10000 samples
Epoch 1/8
 - 29s - loss: 0.1734 - acc: 0.9467 - val_loss: 0.0340 - val_acc: 0.9887
Epoch 2/8
 - 28s - loss: 0.0453 - acc: 0.9853 - val_loss: 0.0297 - val_acc: 0.9895
Epoch 3/8
 - 27s - loss: 0.0300 - acc: 0.9906 - val_loss: 0.0274 - val_acc: 0.9917
Epoch 4/8
 - 28s - loss: 0.0222 - acc: 0.9928 - val_loss: 0.0253 - val_acc: 0.9928
Epoch 5/8
 - 27s - loss: 0.0173 - acc: 0.9946 - val_loss: 0.0341 - val_acc: 0.9918
Epoch 6/8
 - 28s - loss: 0.0146 - acc: 0.9951 - val_loss: 0.0348 - val_acc: 0.9911
Epoch 7/8
 - 27s - loss: 0.0119 - acc: 0.9958 - val_loss: 0.0303 - val_acc: 0.9932
Epoch 8/8
 - 27s - loss: 0.0107 - acc: 0.9963 - val_loss: 0.0299 - val_acc: 0.9934
[stdout:1] 
Train on 60000 samples, validate on 10000 samples
Epoch 1/8
 - 28s - loss: 0.1730 - acc: 0.9472 - val_loss: 0.0340 - val_acc: 0.9887
Epoch 2/8
 - 28s - loss: 0.0455 - acc: 0.9859 - val_loss: 0.0297 - val_acc: 0.9895
Epoch 3/8
 - 27s - loss: 0.0284 - acc: 0.9908 

## Evaluate the model

In [10]:
%%px

# Evaluate on the test set; score holds [loss, accuracy] as configured in compile
score = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

[stdout:0] 
Test loss: 0.029908974236061068
Test accuracy: 0.9934
[stdout:1] 
Test loss: 0.029908974236061068
Test accuracy: 0.9934
[stdout:2] 
Test loss: 0.029908974236061068
Test accuracy: 0.9934
[stdout:3] 
Test loss: 0.029908974236061068
Test accuracy: 0.9934
