# Distributed training of ATLAS RPV CNN Classifier

In this notebook, we extend the Train_rpv example to train distributed across nodes
using ipyparallel and Horovod.

* TODO: improve documentation.

In [1]:
# System imports
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

# External imports
import ipyparallel as ipp

import matplotlib.pyplot as plt
%matplotlib notebook

## Connect to ipyparallel cluster

In [2]:
%%bash
squeue -u sfarrell

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          13292538      resv       sh sfarrell  R       2:09     64 nid0[1273-1279,1281-1283,1285-1292,1294-1315,1317-1338,1340,1342]


In [3]:
# Cluster ID taken from job ID above
job_id = 13292538
cluster_id = 'cori_{}'.format(job_id)

# Use default profile
c = ipp.Client(timeout=60, cluster_id=cluster_id)
print('Worker IDs:', c.ids)

Worker IDs: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]


## Setup the workers

In [4]:
%%px

from __future__ import print_function
from __future__ import division

import os
import socket

import keras
import horovod.keras as hvd

from rpv import load_dataset, build_model, train_model

[stderr:0] 
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[stderr:1] 
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[stderr:2] 
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[stderr:3] 
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[stderr:4] 
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[stderr:5] 
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[stderr:6] 
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[stderr:7] 
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[stderr:8] 
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[stderr:9] 
  from ._conv import register_converters as _register_convert

In [5]:
%%px

# Initialize horovod
hvd.init()
print('MPI rank %i, local rank %i, host %s' %
      (hvd.rank(), hvd.local_rank(), socket.gethostname()))

[stdout:0] MPI rank 0, local rank 0, host nid01273
[stdout:1] MPI rank 40, local rank 0, host nid01317
[stdout:2] MPI rank 42, local rank 0, host nid01319
[stdout:3] MPI rank 1, local rank 0, host nid01274
[stdout:4] MPI rank 26, local rank 0, host nid01302
[stdout:5] MPI rank 52, local rank 0, host nid01329
[stdout:6] MPI rank 27, local rank 0, host nid01303
[stdout:7] MPI rank 22, local rank 0, host nid01298
[stdout:8] MPI rank 19, local rank 0, host nid01295
[stdout:9] MPI rank 25, local rank 0, host nid01301
[stdout:10] MPI rank 29, local rank 0, host nid01305
[stdout:11] MPI rank 60, local rank 0, host nid01337
[stdout:12] MPI rank 7, local rank 0, host nid01281
[stdout:13] MPI rank 3, local rank 0, host nid01276
[stdout:14] MPI rank 13, local rank 0, host nid01288
[stdout:15] MPI rank 62, local rank 0, host nid01340
[stdout:16] MPI rank 23, local rank 0, host nid01299
[stdout:17] MPI rank 28, local rank 0, host nid01304
[stdout:18] MPI rank 4, local rank 0, host nid01277
[stdout:

## Load the data

In [6]:
%%px

# Data config
n_train = 64000 #412416
n_valid = 32000 #137471
n_test = 32000 #137471
input_dir = '/global/cscratch1/sd/sfarrell/atlas-rpv-images'

# Load the data files
((train_input, train_labels, train_weights),
 (valid_input, valid_labels, valid_weights),
 (test_input, test_labels, test_weights)) = load_dataset(input_dir, n_train, n_valid, n_test)
print('train shape:', train_input.shape)
print('valid shape:', valid_input.shape)
print('test shape: ', test_input.shape)

[stdout:0] 
train shape: (64000, 64, 64, 1)
valid shape: (32000, 64, 64, 1)
test shape:  (32000, 64, 64, 1)
[stdout:1] 
train shape: (64000, 64, 64, 1)
valid shape: (32000, 64, 64, 1)
test shape:  (32000, 64, 64, 1)
[stdout:2] 
train shape: (64000, 64, 64, 1)
valid shape: (32000, 64, 64, 1)
test shape:  (32000, 64, 64, 1)
[stdout:3] 
train shape: (64000, 64, 64, 1)
valid shape: (32000, 64, 64, 1)
test shape:  (32000, 64, 64, 1)
[stdout:4] 
train shape: (64000, 64, 64, 1)
valid shape: (32000, 64, 64, 1)
test shape:  (32000, 64, 64, 1)
[stdout:5] 
train shape: (64000, 64, 64, 1)
valid shape: (32000, 64, 64, 1)
test shape:  (32000, 64, 64, 1)
[stdout:6] 
train shape: (64000, 64, 64, 1)
valid shape: (32000, 64, 64, 1)
test shape:  (32000, 64, 64, 1)
[stdout:7] 
train shape: (64000, 64, 64, 1)
valid shape: (32000, 64, 64, 1)
test shape:  (32000, 64, 64, 1)
[stdout:8] 
train shape: (64000, 64, 64, 1)
valid shape: (32000, 64, 64, 1)
test shape:  (32000, 64, 64, 1)
[stdout:9] 
train shape: (64

## Build and train the model

In [7]:
%%px

# Model config
conv_sizes = [16, 32, 64]
fc_sizes = [128]
optimizer = 'Adam'
lr = 0.001 * hvd.size()
dropout=0.2

# Training config
batch_size = 128
n_epochs = 4

# Build the model
model = build_model(train_input.shape[1:],
                    conv_sizes=conv_sizes, fc_sizes=fc_sizes,
                    dropout=dropout,
                    optimizer=optimizer, lr=lr,
                    use_horovod=True)
if hvd.rank() == 0:
    model.summary()

[stdout:0] 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 64, 64, 1)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 64, 16)        160       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 32, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 32, 32)        4640      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 16, 16, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 16, 16, 64)        18496     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 8, 8, 64)          0        

In [8]:
%%px

# Train the model
history = train_model(model, train_input=train_input, train_labels=train_labels,
                      valid_input=valid_input, valid_labels=valid_labels,
                      batch_size=batch_size, n_epochs=n_epochs,
                      use_horovod=True)

[stdout:0] 
Train on 64000 samples, validate on 32000 samples
Epoch 1/4
 - 77s - loss: 6.8979 - acc: 0.5711 - val_loss: 6.8633 - val_acc: 0.5742
Epoch 2/4
 - 73s - loss: 6.9104 - acc: 0.5713 - val_loss: 6.8633 - val_acc: 0.5742
Epoch 3/4
 - 74s - loss: 6.9104 - acc: 0.5713 - val_loss: 6.8633 - val_acc: 0.5742
Epoch 4/4
 - 75s - loss: 6.9104 - acc: 0.5713 - val_loss: 6.8633 - val_acc: 0.5742
[stdout:1] 
Train on 64000 samples, validate on 32000 samples
Epoch 1/4
 - 77s - loss: 6.9007 - acc: 0.5710 - val_loss: 6.8633 - val_acc: 0.5742
Epoch 2/4
 - 74s - loss: 6.9104 - acc: 0.5713 - val_loss: 6.8633 - val_acc: 0.5742
Epoch 3/4
 - 73s - loss: 6.9104 - acc: 0.5713 - val_loss: 6.8633 - val_acc: 0.5742
Epoch 4/4
 - 75s - loss: 6.9104 - acc: 0.5713 - val_loss: 6.8633 - val_acc: 0.5742
[stdout:2] 
Train on 64000 samples, validate on 32000 samples
Epoch 1/4
 - 78s - loss: 6.8977 - acc: 0.5710 - val_loss: 6.8633 - val_acc: 0.5742
Epoch 2/4
 - 73s - loss: 6.9104 - acc: 0.5713 - val_loss: 6.8633 - 

In [None]:
# Can I get worker-local variables out?
epochs = c[0].get('history.epoch')
histories = c[:].get('history.history')

In [None]:
%matplotlib notebook

In [None]:
plt.figure(figsize=(9,4))

# Plot the loss
plt.subplot(121)
plt.plot(epochs, histories[0]['val_loss'], label='Validation')
for i, h in enumerate(histories):
    plt.plot(epochs, h['loss'])#, label='Train %i' % i)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc=0)

# Plot the accuracy
plt.subplot(122)
plt.plot(epochs, histories[0]['val_acc'], label='Validation')
for i, h in enumerate(histories):
    plt.plot(epochs, h['acc'])#, label='Train %i' % i)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc=0)

plt.tight_layout()

## Evaluate on the test set

In [None]:
from sklearn import metrics

def summarize_metrics(labels, outputs, threshold=0.5, weights=None):
    preds = outputs > threshold
    #print('Metrics summaries with threshold of %.3f' % threshold)
    print('Accuracy:   %.4f' % metrics.accuracy_score(labels, preds, sample_weight=weights))
    print('Purity:     %.4f' % metrics.precision_score(labels, preds, sample_weight=weights))
    print('Efficiency: %.4f' % metrics.recall_score(labels, preds, sample_weight=weights))

def draw_roc(labels, outputs, weights=None, ax=None):
    fpr, tpr, _ = metrics.roc_curve(labels, outputs, sample_weight=weights)
    auc = metrics.roc_auc_score(labels, outputs, sample_weight=weights)
    if ax is None:
        fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label='CNN, AUC=%.3f' % auc)
    ax.plot([0, 1], [0, 1], '--', label='Random')
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.legend(loc=0)

In [None]:
%%px

test_output = model.predict(test_input)
test_output = test_output.squeeze(-1)
score = model.evaluate(test_input, test_labels, verbose=2)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
test_output = c[0].get('test_output')
test_labels = c[0].get('test_labels')
test_weights = c[0].get('test_weights')

In [None]:
# Unweighted results
print('Unweighted metrics')
summarize_metrics(test_labels, test_output)
print()

# Weighted results
print('Weighted metrics')
summarize_metrics(test_labels, test_output, weights=test_weights)

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(9,4))
draw_roc(test_labels, test_output, ax=axs[0])
draw_roc(test_labels, test_output, ax=axs[1], weights=test_weights)
axs[0].set_xlim([0, 0.01])
axs[0].set_title('Unweighted')
axs[1].set_xlim([0, 0.01])
axs[1].set_title('Weighted');

In [None]:
# Visualize the model output
plt.figure()

# Select real/fake sample model outputs
real_idx = test_labels > 0.5
fake_idx = (real_idx == False)

binning=dict(bins=100, range=(0, 1), log=True)
plt.hist(test_output[fake_idx], weights=test_weights[fake_idx], label='Fake', **binning)
plt.hist(test_output[real_idx], weights=test_weights[real_idx], label='Real', **binning)
plt.legend(loc=0);