In [1]:
import os
import sys
sys.path.append('..')
os.environ['TF_XLA_FLAGS'] = "--tf_xla_auto_jit=fusible"
import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as mixed_precision

import horovod.tensorflow as hvd
hvd.init()

# Pin this process to a single GPU. The index must be the *local* rank (the
# rank within this machine): the global rank runs past the end of the local
# device list as soon as the job spans more than one node.
physical_devices = tf.config.list_physical_devices('GPU')
local_gpu = physical_devices[hvd.local_rank()]
# Optional: let TensorFlow allocate GPU memory on demand instead of up front.
#tf.config.experimental.set_memory_growth(local_gpu, True)
tf.config.set_visible_devices(local_gpu, 'GPU')
# Logical GPU list as seen after restricting visibility; later cells use devices[0].
devices = tf.config.list_logical_devices('GPU')

# Turn on the grappler automatic mixed precision graph rewrite.
# (Alternative left disabled: set a Keras 'mixed_float16' policy via
# mixed_precision.Policy / mixed_precision.set_policy.)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

In [2]:
from tqdm import tqdm
from statistics import mean
import GPUtil

In [3]:
from mask_rcnn.tf2_model import MaskRCNN
from mask_rcnn.hyperparameters import dataset_params
from mask_rcnn.hyperparameters import mask_rcnn_params
from mask_rcnn import dataset_utils
from mask_rcnn.training import losses, learning_rates
from simple_model.tf2 import weight_loader, train, scheduler
from simple_model import model_v2

# --- Dataset locations (TFRecord file globs) ---
train_file_pattern = '/home/ubuntu/data/coco/tf_record/train*'
eval_file_pattern = '/home/ubuntu/data/coco/tf_record/val*'

# --- Batch geometry ---
# batch_size is per worker; the global batch is spread over all Horovod ranks.
batch_size = 4
global_batch_size = batch_size * hvd.size()
images = 118287  # number of training images (presumably COCO train2017 -- confirm)
steps_per_epoch = images // global_batch_size

# --- Default parameter sets ---
train_data_params = dataset_params.get_data_params()
eval_data_params = dataset_params.get_data_params()
params = mask_rcnn_params.default_config().values()

# --- Input pipeline overrides ---
train_data_params['batch_size'] = batch_size
eval_data_params['batch_size'] = 1

# --- Model / optimisation overrides ---
# Learning rates are scaled linearly with the global batch size; the schedule
# boundaries are expressed in steps (after epochs 9 and 11).
for _key, _value in (
    ('finetune_bn', False),
    ('train_batch_size', batch_size),
    ('l2_weight_decay', 1e-4),
    ('init_learning_rate', 2e-3 * global_batch_size),
    ('warmup_learning_rate', 2e-4 * global_batch_size),
    ('warmup_steps', 4096 // global_batch_size),
    ('learning_rate_steps', [steps_per_epoch * 9, steps_per_epoch * 11]),
    ('learning_rate_levels', [2e-4 * global_batch_size, 2e-5 * global_batch_size]),
    ('momentum', 0.9),
    ('use_batched_nms', False),
    ('use_custom_box_proposals_op', True),
    ('amp', True),
    ('include_groundtruth_in_features', True),
):
    params[_key] = _value

In [4]:
# Training input pipeline: build the loader, materialise the dataset, then
# stage batches onto this process's GPU so the step does not wait on a host copy.
train_loader = dataset_utils.FastDataLoader(train_file_pattern, train_data_params)
train_tdf = train_loader(train_data_params).apply(
    tf.data.experimental.prefetch_to_device(
        devices[0].name,
        buffer_size=tf.data.experimental.AUTOTUNE,
    )
)
train_iter = iter(train_tdf)

[MaskRCNN] INFO    : Using Dataset Sharding with Horovod


In [5]:
# Evaluation input pipeline: same construction as the training one, using the
# validation file pattern and the evaluation data parameters.
eval_loader = dataset_utils.FastDataLoader(eval_file_pattern, eval_data_params)
eval_tdf = eval_loader(eval_data_params).apply(
    tf.data.experimental.prefetch_to_device(
        devices[0].name,
        buffer_size=tf.data.experimental.AUTOTUNE,
    )
)
eval_iter = iter(eval_tdf)

[MaskRCNN] INFO    : Using Dataset Sharding with Horovod


In [6]:
# Instantiate the Mask R-CNN model from the hyperparameter dict `params`.
mask_rcnn = model_v2.MRCNN(params)

In [7]:
# Pull one training batch and run a single training-mode forward pass on it.
# NOTE(review): presumably this call is what creates the model's variables so
# that checkpoint weights can be loaded afterwards -- confirm.
features, labels = next(train_iter)
model_outputs = mask_rcnn(features, labels, params, is_training=True)

In [8]:
# Load weights from the ResNet checkpoint directory into the model.
# The path is relative to the notebook's working directory.
weight_loader.load_resnet_checkpoint(mask_rcnn, '../resnet/resnet-nhwc-2018-02-07/')

In [9]:
# Sanity check: evaluate the training forward pass eagerly on the sample batch
# and display the result (the same value train_step later uses as total_loss).
train.train_forward(features, labels, params, mask_rcnn)

<tf.Tensor: shape=(), dtype=float32, numpy=8.003389>

In [10]:
# Piecewise-constant decay: start at the initial learning rate and switch to
# each configured level at the matching step boundary.
base_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    params['learning_rate_steps'],
    [params['init_learning_rate']] + params['learning_rate_levels'],
)
# Wrap the decay schedule with the project's warmup scheduler. `schedule` is the
# name later cells call to report the current learning rate.
schedule = scheduler.WarmupScheduler(
    base_schedule,
    params['warmup_learning_rate'],
    params['warmup_steps'],
)
# Momentum comes from the hyperparameter dict rather than a duplicated literal,
# so the optimizer cannot silently drift from the configured value.
sgd = tf.keras.optimizers.SGD(schedule, momentum=params['momentum'])
# Dynamic loss scaling; train_step uses get_scaled_loss / get_unscaled_gradients
# on this wrapper.
optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(sgd, 'dynamic')

In [11]:
@tf.function
def train_step(features, labels, params, model, opt, first=False):
    """Run one distributed training step and return the unscaled loss.

    Args:
        features: Input features for one batch, passed to train.train_forward.
        labels: Labels for the same batch, passed to train.train_forward.
        params: Hyperparameter dict, passed to train.train_forward.
        model: Model whose trainable variables are updated.
        opt: Loss-scaling optimizer; must provide get_scaled_loss,
            get_unscaled_gradients, apply_gradients and variables().
        first: Python bool. When True, model and optimizer variables are
            broadcast from rank 0 after the update so all workers start in sync.

    Returns:
        The total loss computed by train.train_forward (before loss scaling).
    """
    with tf.GradientTape() as tape:
        total_loss = train.train_forward(features, labels, params, model)
        # Use the optimizer passed in, not a notebook global, so the function
        # behaves the same whichever optimizer the caller supplies.
        scaled_loss = opt.get_scaled_loss(total_loss)
    # Wrap the tape so gradients are combined across Horovod workers.
    tape = hvd.DistributedGradientTape(tape)
    scaled_gradients = tape.gradient(scaled_loss, model.trainable_variables)
    gradients = opt.get_unscaled_gradients(scaled_gradients)
    opt.apply_gradients(zip(gradients, model.trainable_variables))
    if first:
        # Broadcast after the first apply_gradients so that optimizer variables
        # created by that call are included.
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
    return total_loss

In [12]:
# Run the first step with first=True, which makes train_step also broadcast the
# model and optimizer variables from rank 0. The returned loss is discarded.
_ = train_step(features, labels, params, mask_rcnn, optimizer, first=True)

In [None]:
# Train for half an epoch. Only rank 0 shows a progress bar and keeps a loss
# history; every other rank just iterates.
is_root = hvd.rank() == 0
half_epoch_steps = steps_per_epoch // 2
p_bar = range(half_epoch_steps)
if is_root:
    p_bar = tqdm(p_bar)
    loss_history = []
for i in p_bar:
    features, labels = next(train_iter)
    total_loss = train_step(features, labels, params, mask_rcnn, optimizer)
    if not is_root:
        continue
    loss_history.append(total_loss.numpy())
    # Report the mean of the most recent 50 losses plus the current learning rate.
    smoothed_loss = mean(loss_history[-50:])
    current_lr = schedule(optimizer.iterations)
    p_bar.set_description("Loss: {0:.4f}, LR: {1:.4f}".format(smoothed_loss, current_lr))

Loss: 1.6630, LR: 0.0080:  72%|███████▏  | 10647/14785 [37:37<14:26,  4.78it/s] 

In [None]:
@tf.function
def pred(features, params):
    """Run the notebook-level `mask_rcnn` model in inference mode on one batch.

    Args:
        features: Feature mapping for the batch; must contain 'image_info'
            and 'source_ids'.
        params: Hyperparameter dict forwarded to the model.

    Returns:
        The model's output mapping, extended with 'image_info' and 'source_id'
        entries copied from `features`.
    """
    predictions = mask_rcnn(features, None, params, is_training=False)
    # Carry the image metadata alongside the predictions.
    predictions['image_info'] = features['image_info']
    predictions['source_id'] = features['source_ids']
    return predictions

In [16]:
# Record the memory currently in use on the first GPU that GPUtil enumerates.
# NOTE(review): index 0 follows GPUtil's ordering, which may not be the GPU made
# visible to this process -- confirm when running with more than one rank.
gpu_mem = GPUtil.getGPUs()[0].memoryUsed

In [17]:
# Display the recorded GPU memory reading.
gpu_mem

15664.0

In [18]:
# mem 9420.0

In [19]:
# Number of evaluation batches to run (eval batch size is 1).
# NOTE(review): presumably the size of the validation set -- confirm.
num_eval_batches = 5000

# Run inference batch by batch and collect the raw prediction dicts.
# NOTE(review): every returned tensor stays referenced by this list for the
# whole loop; if device memory grows during evaluation, consider converting the
# outputs to host arrays before storing them.
some_predictions = []
for i in tqdm(range(num_eval_batches)):
    eval_features, eval_labels = next(eval_iter)
    some_predictions.append(pred(eval_features, params))

100%|██████████| 5000/5000 [03:50<00:00, 21.70it/s]


In [21]:
# Inspect the source id(s) of the last batch drawn by the evaluation loop.
eval_features['source_ids']

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([178744])>

In [None]:
#[270402, 458255,  61171, 469828]
#[110638]