In [None]:
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# List of changes:
# - loading habana module
# - added support for prefetching to HPU
# - added profiling callbacks support
# - changed include paths of modules
# - flag for setting tensorflow global seed
# - include mechanism for dumping tensors

# Copyright (C) 2020-2021 Habana Labs, Ltd. an Intel Company

# TensorFlow Keras ResNet50 Example on Habana Gaudi<sup>TM</sup>

##### This example demonstrates how to train Keras ResNet50 on a Habana Gaudi<sup>TM</sup> device with the TensorFlow framework. The neural network is built with Keras APIs and trained with synthetic data.

## Load TensorBoard extension for Jupyter notebook

In [3]:
# Register the TensorBoard notebook extension so that the %tensorboard magic
# used in the last cell of this notebook is available.
%load_ext tensorboard

## Imports

In [4]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import app
from absl import flags
from absl import logging

import tensorflow as tf

### Import Python packages from the Habana Model-References GitHub repository

In [5]:
from TensorFlow.common.modeling import performance
from TensorFlow.common.training import controller
from TensorFlow.common.library_loader import load_habana_module
from TensorFlow.common.debug import dump_callback
from TensorFlow.common.horovod_helpers import synapse_logger_init
from TensorFlow.common.tb_utils import write_hparams_v2

from TensorFlow.utils.logs import logger
from TensorFlow.utils.misc import distribution_utils
from TensorFlow.utils.misc import keras_utils
from TensorFlow.utils.misc import model_helpers

from TensorFlow.computer_vision.common import imagenet_preprocessing
from TensorFlow.computer_vision.Resnets.resnet_keras.local_flags import core as flags_core
from TensorFlow.computer_vision.Resnets.utils.optimizers.keras import lars_util
from TensorFlow.computer_vision.Resnets.resnet_keras import common
from TensorFlow.computer_vision.Resnets.resnet_keras import resnet_runnable
from TensorFlow.computer_vision.Resnets.resnet_keras.common import adjust_batch_size

from central.habana_model_yaml_config import HabanaModelYamlConfig
from central.habana_model_runner_utils import HabanaEnvVariables

## Define training arguments

In [6]:
# Show absl log messages of level INFO and above in the cell output.
logging.set_verbosity(logging.INFO)
# Define the Keras training arguments (read later through flags.FLAGS).
common.define_keras_flags()
# Define the Habana-specific arguments.
common.define_habana_flags()
# Define the LARS optimizer arguments.
lars_util.define_lars_flags()
# NOTE(review): these helpers presumably register absl flags, which can only be
# defined once per process; re-running this cell in a live kernel will likely
# fail with a duplicate-flag error -- confirm, and restart the kernel if so.

## Define function to parse arguments from yaml configuration file

In [7]:
def parse_args_yaml_config(config_file):
    """Read the 'resnet_keras' model settings from a YAML configuration file.

    Args:
      config_file: YAML configuration file handed to HabanaModelYamlConfig.

    Returns:
      A tuple (env_args, cmd_args): the environment variables reported by the
      config object, and the argument list it filled in with the excluded
      fields left out.
    """
    config = HabanaModelYamlConfig('resnet_keras', config_file)

    env_vars = config.get_env_vars()
    # The returned parameters are not used here; the call itself is kept so
    # that the function behaves exactly as before.
    _unused_params = config.get_parameters()

    # Fields that must not be turned into command-line arguments.
    skipped_fields = ['use_horovod', 'num_workers_per_hls', 'hls_type']
    arguments = []
    config.add_parameters_except(arguments, skipped_fields)

    return env_vars, arguments

## Define functions for training

In [8]:
def build_stats(runnable, time_callback):
    """Collects the final metrics and timing information of a run.

    Args:
      runnable: Object exposing `flags_obj` plus the train/test loss and
        accuracy metrics.
      time_callback: Time tracking callback instance; ignored when falsy.

    Returns:
      Dictionary holding the collected statistics.
    """
    stats = {}

    # Metric values (eval and train alike) are only reported when evaluation
    # has not been skipped.
    if not runnable.flags_obj.skip_eval:
        metric_sources = (
            ('eval_loss', 'test_loss'),
            ('eval_acc', 'test_accuracy'),
            ('train_loss', 'train_loss'),
            ('train_acc', 'train_accuracy'),
        )
        for stat_name, metric_attr in metric_sources:
            stats[stat_name] = getattr(runnable, metric_attr).result().numpy()

    # Without a timing callback there is nothing more to add.
    if not time_callback:
        return stats

    stats['step_timestamp_log'] = time_callback.timestamp_log
    stats['train_finish_time'] = time_callback.train_finish_time
    # Throughput is only reported when epoch runtimes were recorded.
    if time_callback.epoch_runtime_log:
        stats['avg_exp_per_second'] = time_callback.average_examples_per_second

    return stats

In [9]:
def get_num_train_iterations(flags_obj):
    """Derives the step and epoch counts used for training and evaluation.

    Args:
      flags_obj: Parsed flags providing `batch_size`, `train_epochs` and
        `train_steps`.

    Returns:
      Tuple (train_steps, train_epochs, eval_steps), where train_steps is the
      number of training steps per epoch.
    """
    image_counts = imagenet_preprocessing.NUM_IMAGES

    # Steps per epoch are based on the adjusted batch size.
    steps_per_epoch = image_counts['train'] // adjust_batch_size(flags_obj.batch_size)
    num_epochs = flags_obj.train_epochs

    # An explicit step limit caps the epoch length and forces a single epoch.
    if flags_obj.train_steps:
        steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
        num_epochs = 1

    # Evaluation steps use the unadjusted batch size from the flags.
    num_eval_steps = image_counts['validation'] // flags_obj.batch_size

    return steps_per_epoch, num_epochs, num_eval_steps


In [10]:
def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
    """Calculates steps to run on device."""
    
    if steps_per_loop <= 0:
        raise ValueError('steps_per_loop should be positive integer.')
    if steps_per_loop == 1:
        return steps_per_loop
    return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)


## Define function to run training

In [11]:
def run(flags_obj):
    """Run ResNet50 training (and optional evaluation) with a custom training loop.

    Configures the session, the mixed-precision policy, the image data format
    and the distribution strategy, optionally creates TensorBoard writers and a
    profiler callback, builds the ResnetRunnable and the Controller, trains,
    and finally collects the run statistics.

    Args:
      flags_obj: An object containing parsed flag values.

    Raises:
      Nothing is raised directly by this function; exceptions coming from the
      helpers it calls propagate to the caller.
      NOTE(review): an earlier version of this docstring listed a ValueError
      for fp16, but no such check is visible in this function -- confirm
      whether one of the callees performs it.

    Returns:
      Dictionary of training and eval stats, as produced by build_stats.
    """

    # Configure eager execution and XLA for the session from the flags.
    keras_utils.set_session_config(
        enable_eager=flags_obj.enable_eager,
        enable_xla=flags_obj.enable_xla)
    # Set the mixed-precision policy from the dtype derived from the flags.
    performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))

    # set data format
    data_format = flags_obj.data_format

    # No explicit format given: pick channels_first when TensorFlow was built
    # with CUDA, channels_last otherwise.
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    batch_size = flags_obj.batch_size
    model_dir = flags_obj.model_dir

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs,
        tpu_address=flags_obj.tpu)

    # Summary writers are only created when TensorBoard logging is enabled;
    # otherwise both stay None.
    train_writer, eval_writer = None, None
    if flags_obj.enable_tensorboard:
        import os
        train_writer = tf.summary.create_file_writer(model_dir)
        # eval_writer is created here but not referenced again in this function.
        eval_writer = tf.summary.create_file_writer(os.path.join(model_dir, 'eval'))
        # Record the flag values as hyper-parameters through the train writer.
        write_hparams_v2(train_writer, flags_obj.flag_values_dict())


    per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
        flags_obj)
    # A single loop never runs more steps than one epoch contains.
    steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
    train_steps = train_epochs * per_epoch_steps

    logging.info(
        'Training %d epochs, each epoch has %d steps, '
        'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
        train_steps, eval_steps)

    # Timing callback; train_writer is None when TensorBoard is disabled.
    time_callback = keras_utils.TimeHistory(
        batch_size,
        flags_obj.log_steps,
        summary_writer=train_writer,
        batch_size_per_node=flags_obj.batch_size)

    # A profiler callback is only built when profile_steps was provided.
    profiler_callback = None
    if flags_obj.profile_steps is not None:
        profiler_callback = keras_utils.get_profiler_callback(
            model_dir,
            flags_obj.profile_steps,
            flags_obj.enable_tensorboard,
            per_epoch_steps)
    # The runnable is constructed inside the scope of the selected strategy.
    with distribution_utils.get_strategy_scope(strategy):
        runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback,
                                                  train_steps,
                                                  per_epoch_steps,
                                                  profiler_callback)

    # Intervals are expressed in steps; checkpoint and summary intervals are
    # None when the corresponding feature is disabled.
    eval_interval = flags_obj.epochs_between_evals * per_epoch_steps
    checkpoint_interval = (
        per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
    summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None

    checkpoint_manager = tf.train.CheckpointManager(
        runnable.checkpoint,
        directory=model_dir,
        max_to_keep=10,
        step_counter=runnable.global_step,
        checkpoint_interval=checkpoint_interval)

    # Same value as the train_steps computed earlier in this function
    # (train_epochs * per_epoch_steps); the reassignment does not change it.
    train_steps=per_epoch_steps * train_epochs

    resnet_controller = controller.Controller(
        strategy,
        runnable.train,
        runnable.evaluate,
        global_step=runnable.global_step,
        steps_per_loop=steps_per_loop,
        train_steps=train_steps,
        checkpoint_manager=checkpoint_manager,
        summary_interval=summary_interval,
        eval_steps=eval_steps,
        eval_interval=eval_interval)

    # The timing callback is notified manually around the controller's loop.
    time_callback.on_train_begin()
    # Evaluation is requested from the controller unless skip_eval is set.
    resnet_controller.train(evaluate=not flags_obj.skip_eval)
    time_callback.on_train_end()

    stats = build_stats(runnable, time_callback)
    return stats


## Parse the training arguments from yaml config file

In [12]:
# Read the environment variables and the command-line style arguments from the
# YAML configuration file.
env_args, cmd_args = parse_args_yaml_config('resnet50_keras_lars_bf16_1card.yaml')

# Workaround for command-line style parsing: put a placeholder in front of the
# real arguments before handing the list to absl.
# NOTE(review): absl's FLAGS(argv) is understood to treat argv[0] as the
# program name and parse only the remaining elements -- confirm in the absl docs.
cmd_args.insert(0, ' ')
argv = flags.FLAGS(cmd_args)

resnet_keras
{'TF_ENABLE_BF16_CONVERSION': 1, 'USE_LARS_OPTIMIZER': 1, 'TF_ALLOW_CONTROL_EDGES_IN_HABANA_OPS': 1, 'HABANA_NCCL_COMM_API': True, 'TF_DISABLE_MKL': 1}
{'use_synthetic_data': True, 'skip_eval': True, 'batch_size': 256, 'steps_per_loop': 100, 'log_steps': 200, 'model_dir': '/root/tmp/resnet/', 'enable_tensorboard': True, 'data_format': 'channels_last', 'train_steps': 1000, 'use_horovod': False, 'optimizer': 'LARS', 'lr_schedule': 'polynomial', 'data_loader_image_type': 'bf16', 'weight_decay': 0.0001, 'label_smoothing': 0.1, 'base_learning_rate': 2.5, 'warmup_epochs': 3, 'distribution_strategy': 'off', 'num_gpus': 0, 'single_l2_loss_op': True}


## Initialize preloading libraries and Synapse logger API

In [13]:
# Run the preloading initialization provided by the resnet_keras `common` module.
common.initialize_preloading()
# Initialize the Synapse logger.
synapse_logger_init()

## Load Habana TensorFlow modules and acquire Habana Gaudi device

In [14]:
# Load the Habana TensorFlow module; whatever it returns is logged below as the
# device information.
log_info_devices = load_habana_module()
logging.info('Devices:\n%s', log_info_devices)

2021-09-08 03:30:50.205345: I tensorflow/core/framework/kernel_def_builder.cc:43] 0
2021-09-08 03:30:50.205402: I tensorflow/core/framework/kernel_def_builder.cc:43] 1
2021-09-08 03:30:50.205433: I tensorflow/core/framework/kernel_def_builder.cc:43] 0
2021-09-08 03:30:50.205439: I tensorflow/core/framework/kernel_def_builder.cc:43] 1
2021-09-08 03:30:50.205452: I tensorflow/core/framework/kernel_def_builder.cc:43] 0
2021-09-08 03:30:50.205457: I tensorflow/core/framework/kernel_def_builder.cc:43] 1
2021-09-08 03:30:50.205471: I tensorflow/core/framework/kernel_def_builder.cc:43] 0
2021-09-08 03:30:50.205476: I tensorflow/core/framework/kernel_def_builder.cc:43] 1
2021-09-08 03:30:50.205487: I tensorflow/core/framework/kernel_def_builder.cc:43] 0
2021-09-08 03:30:50.205492: I tensorflow/core/framework/kernel_def_builder.cc:43] 1
2021-09-08 03:30:50.205507: I tensorflow/core/framework/kernel_def_builder.cc:43] 0
2021-09-08 03:30:50.205512: I tensorflow/core/framework/kernel_def_builder.c

## Launch ResNet50 training with LARS optimizer

In [15]:
# Seed TensorFlow's global random generator when a seed flag was supplied.
# The check is against None (not truthiness) so that an explicit seed of 0 is
# honoured instead of being silently ignored.
if flags.FLAGS.global_seed is not None:
    tf.random.set_seed(flags.FLAGS.global_seed)

# env_args comes from the YAML config parsed earlier in this notebook.
# NOTE(review): HabanaEnvVariables presumably applies these variables for the
# duration of the `with` block -- confirm against its implementation.
with HabanaEnvVariables(env_args):
    # The whole run is wrapped in the tensor-dumping context from the debug module.
    with dump_callback():
        model_helpers.apply_clean(flags.FLAGS)
        # Only the training call itself runs inside the benchmark context.
        with logger.benchmark_context(flags.FLAGS):
            stats = run(flags.FLAGS)
        logging.info('Run stats:\n%s', stats)

2021-09-08 03:30:53.214090: W /home/jenkins/workspace/cdsoftwarebuilder/create-tensorflow-module---bpt-d/tensorflow-training/habana_device/habana_device.cpp:162] Init done for library version 1.0.0_7ec4652c_tf2.5.0


INFO:tensorflow:Type of parameter "logger_levels" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "logger_levels" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "profile_file" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "profile_file" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "dump_config" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "dump_config" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "shuffle_seed" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "shuffle_seed" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "random_flip_left_right_seed" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "random_flip_left_right_seed" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "ls" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "ls" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "loss_scale" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "loss_scale" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "ara" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "ara" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "all_reduce_alg" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "all_reduce_alg" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "gt_mode" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "gt_mode" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "tf_gpu_thread_mode" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "tf_gpu_thread_mode" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "datasets_num_private_threads" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "datasets_num_private_threads" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "bti" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "bti" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "benchmark_test_id" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "benchmark_test_id" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "bld" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "bld" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "benchmark_log_dir" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "benchmark_log_dir" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "gp" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "gp" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "gcp_project" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "gcp_project" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "worker_hosts" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "worker_hosts" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "profile_steps" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "profile_steps" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "global_seed" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "global_seed" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "end_learning_rate" is not one of (bool, int, float, str). It will be saved as a string.


INFO:tensorflow:Type of parameter "end_learning_rate" is not one of (bool, int, float, str). It will be saved as a string.
INFO:absl:Training 1 epochs, each epoch has 1000 steps, total steps: 1000; Eval 195 steps
INFO:absl:Train at step 0 of 1000
INFO:absl:Entering training loop with 100 steps, at step 0 of 1000


Instructions for updating:
rename to distribute_datasets_from_function


Instructions for updating:
rename to distribute_datasets_from_function
2021-09-08 03:31:04.389583: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-09-08 03:31:04.718445: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2700000000 Hz
INFO:absl:step: 100        steps_per_second: 1.05        {'loss': 4.1505466, 'accuracy': 0.64}
INFO:absl:Entering training loop with 100 steps, at step 100 of 1000


step: 100        steps_per_second: 1.05        {'loss': 4.1505466, 'accuracy': 0.64}


INFO:absl:TimeHistory: 111.84 seconds, 457.80 examples/second between steps 0 and 200
INFO:absl:step: 200        steps_per_second: 6.03        {'loss': 1.0158594, 'accuracy': 1.0}
INFO:absl:Entering training loop with 100 steps, at step 200 of 1000


step: 200        steps_per_second: 6.03        {'loss': 1.0158594, 'accuracy': 1.0}


INFO:absl:step: 300        steps_per_second: 6.06        {'loss': 1.015625, 'accuracy': 1.0}
INFO:absl:Entering training loop with 100 steps, at step 300 of 1000


step: 300        steps_per_second: 6.06        {'loss': 1.015625, 'accuracy': 1.0}


INFO:absl:TimeHistory: 33.01 seconds, 1550.90 examples/second between steps 200 and 400
INFO:absl:step: 400        steps_per_second: 6.05        {'loss': 1.015625, 'accuracy': 1.0}
INFO:absl:Entering training loop with 100 steps, at step 400 of 1000


step: 400        steps_per_second: 6.05        {'loss': 1.015625, 'accuracy': 1.0}


INFO:absl:step: 500        steps_per_second: 6.06        {'loss': 1.015625, 'accuracy': 1.0}
INFO:absl:Entering training loop with 100 steps, at step 500 of 1000


step: 500        steps_per_second: 6.06        {'loss': 1.015625, 'accuracy': 1.0}


INFO:absl:TimeHistory: 33.06 seconds, 1548.67 examples/second between steps 400 and 600
INFO:absl:step: 600        steps_per_second: 6.04        {'loss': 1.015625, 'accuracy': 1.0}
INFO:absl:Entering training loop with 100 steps, at step 600 of 1000


step: 600        steps_per_second: 6.04        {'loss': 1.015625, 'accuracy': 1.0}


INFO:absl:step: 700        steps_per_second: 6.05        {'loss': 1.015625, 'accuracy': 1.0}
INFO:absl:Entering training loop with 100 steps, at step 700 of 1000


step: 700        steps_per_second: 6.05        {'loss': 1.015625, 'accuracy': 1.0}


INFO:absl:TimeHistory: 33.05 seconds, 1549.07 examples/second between steps 600 and 800
INFO:absl:step: 800        steps_per_second: 6.05        {'loss': 1.015625, 'accuracy': 1.0}
INFO:absl:Entering training loop with 100 steps, at step 800 of 1000


step: 800        steps_per_second: 6.05        {'loss': 1.015625, 'accuracy': 1.0}


INFO:absl:step: 900        steps_per_second: 6.06        {'loss': 1.0164844, 'accuracy': 1.0}
INFO:absl:Entering training loop with 100 steps, at step 900 of 1000


step: 900        steps_per_second: 6.06        {'loss': 1.0164844, 'accuracy': 1.0}


INFO:absl:TimeHistory: 33.03 seconds, 1550.09 examples/second between steps 800 and 1000
INFO:absl:step: 1000        steps_per_second: 6.05        {'loss': 1.0177344, 'accuracy': 1.0}
INFO:absl:Run stats:
{'step_timestamp_log': ['BatchTimestamp<batch_index: 0, timestamp: 1631071857.7869437>', 'BatchTimestamp<batch_index: 200, timestamp: 1631071969.626038>', 'BatchTimestamp<batch_index: 400, timestamp: 1631072002.6641362>', 'BatchTimestamp<batch_index: 600, timestamp: 1631072035.7503839>', 'BatchTimestamp<batch_index: 800, timestamp: 1631072068.8265276>', 'BatchTimestamp<batch_index: 1000, timestamp: 1631072101.8778355>'], 'train_finish_time': 1631072101.8968284, 'avg_exp_per_second': 1048.7455096379886}


step: 1000        steps_per_second: 6.05        {'loss': 1.0177344, 'accuracy': 1.0}


## Load TensorBoard to display training results

In [16]:
# Start TensorBoard inline on /root/tmp/resnet/, the model_dir value shown in
# the parsed YAML parameters earlier in this notebook.
# NOTE(review): --bind_all presumably makes TensorBoard reachable from other
# hosts on the network -- confirm this exposure is intended.
%tensorboard --bind_all --logdir /root/tmp/resnet/