# SageMaker Debugger rules

Source
- https://sagemaker.readthedocs.io/en/stable/amazon_sagemaker_debugger.html
- https://gitlab.com/juliensimon/dlnotebooks/blob/master/keras/05-keras-blog-post/Fashion%20MNIST-SageMaker.ipynb
- https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-debugger/tensorflow_profiling/tf-resnet-profiling-single-gpu-single-node.ipynb

In [1]:
# install_needed = True
install_needed = False

In [2]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
    !{sys.executable} -m pip install -U 'sagemaker[local]'
    !{sys.executable} -m pip install -U sagemaker-experiments # SageMaker Experiments SDK 
    !{sys.executable} -m pip install -U sagemaker             # SageMaker Python SDK
#     !/bin/bash ./local/local_mode_setup.sh
    IPython.Application.instance().kernel.do_shutdown(True)

In [3]:
# from IPython.display import Image
# Image("fashion-mnist-sprite.png")

In [5]:
import sagemaker

from smexperiments.experiment import Experiment ### SM Experiment
from smexperiments.trial import Trial           ### SM Experiment

from sagemaker.debugger import (
    Rule,
    DebuggerHookConfig,
    TensorBoardOutputConfig,
    CollectionConfig,
    rule_configs,
    ProfilerRule
)

from time import strftime

import boto3

boto_session = boto3.session.Session()
region = boto_session.region_name

sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
prefix = 'keras-fashion-mnist-debugger'

output_path = 's3://{}/{}/output'.format(bucket, prefix)
s3_input_path = 's3://{}/{}/data'.format(bucket, prefix)
training_input_path = s3_input_path + '/train'
validation_input_path = s3_input_path + '/val'

## Download the Fashion-MNIST dataset

In [10]:
import os
import numpy as np
import tensorflow as tf

fashion_mnist = tf.keras.datasets.fashion_mnist
(x_train, y_train), (x_val, y_val) = fashion_mnist.load_data()

os.makedirs("./data/train", exist_ok = True)
os.makedirs("./data/val", exist_ok = True)

np.save('./data/train/x_train.npy', x_train)
np.save('./data/train/y_train.npy', y_train)
np.save('./data/val/x_val.npy', x_val)
np.save('./data/val/y_val.npy', y_val)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [11]:
!ls -lat ./data

total 16
drwxrwxr-x 2 ec2-user ec2-user 4096 Dec 14 15:02 val
drwxrwxr-x 2 ec2-user ec2-user 4096 Dec 14 15:02 train
drwxrwxr-x 4 ec2-user ec2-user 4096 Dec 14 15:02 .
drwxrwxr-x 4 ec2-user ec2-user 4096 Dec 14 15:02 ..


##  Upload Fashion-MNIST data to S3

In [12]:
!aws s3 sync ./data {s3_input_path}

upload: data/val/y_val.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/val/y_val.npy
upload: data/train/y_train.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/train/y_train.npy
upload: data/val/x_val.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/val/x_val.npy
upload: data/train/x_train.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/train/x_train.npy


## Local mode training

In [13]:
!pygmentize mnist_keras_tf.py

[34mimport[39;49;00m [04m[36margparse[39;49;00m, [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m

[34mimport[39;49;00m [04m[36mtensorflow[39;49;00m [34mas[39;49;00m [04m[36mtf[39;49;00m
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m [34mimport[39;49;00m keras
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m[04m[36m.[39;49;00m[04m[36mkeras[39;49;00m [34mimport[39;49;00m backend [34mas[39;49;00m K
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m[04m[36m.[39;49;00m[04m[36mkeras[39;49;00m[04m[36m.[39;49;00m[04m[36mmodels[39;49;00m [34mimport[39;49;00m Sequential
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m[04m[36m.[39;49;00m[04m[36mkeras[39;49;00m[04m[36m.[39;49;00m[04m[36mlayers[39;49;00m [34mimport[39;49;00m Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPooling2D
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m[04m[3

In [14]:
from sagemaker.tensorflow import TensorFlow

hyperparams_local={'epochs': 1,
                   'learning-rate': 0.5
                  }

metric_definitions=[
    {'Name': 'train:loss', 'Regex': ' loss: ([0-9\\.]+)'},
    {'Name': 'train:acc', 'Regex': ' accuracy: ([0-9\\.]+)'},
    {'Name': 'val:loss', 'Regex': ' val_loss: ([0-9\\.]+)'},
    {'Name': 'val:acc', 'Regex': ' val_accuracy: ([0-9\\.]+)'}
]

est_local = TensorFlow(entry_point='mnist_keras_tf.py', 
                       role=role,
                       instance_count=1, 
                       instance_type='local',
                       framework_version='2.3', 
                       py_version='py37',
                       output_path=output_path,
                       hyperparameters=hyperparams_local,
                       metric_definitions=metric_definitions
                       )

In [15]:
est_local.fit({'training': training_input_path, 'validation': validation_input_path}) 

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2021-12-14-15-02-08-567
INFO:sagemaker.local.local_session:Starting training job
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-pefmo:
    command: train
    container_name: aock5tgxvl-algo-1-pefmo
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/tensorflow-training:2.3-cpu-py37
    networks:
      sagemaker-local:
        aliases:
        - algo-1-pefmo
    stdin_open: true
    tty: true
    volumes:
    - /tmp/tmpweczgk90/algo-1-pefmo/output:/opt/ml/output
    - /tmp/tmpweczgk90/algo-1-pefmo/output/dat

Creating aock5tgxvl-algo-1-pefmo ... 
Creating aock5tgxvl-algo-1-pefmo ... done
Attaching to aock5tgxvl-algo-1-pefmo
[36maock5tgxvl-algo-1-pefmo |[0m 2021-12-14 15:03:09.190421: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
[36maock5tgxvl-algo-1-pefmo |[0m 2021-12-14 15:03:09.190562: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
[36maock5tgxvl-algo-1-pefmo |[0m 2021-12-14 15:03:09.219903: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
[36maock5tgxvl-algo-1-pefmo |[0m 2021-12-14 15:03:10,449 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training
[36maock5tgxvl-algo-1-pefmo |[0m 2021-12-14 15:03:10,458 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36maock

## Experiments

In [16]:
def create_experiment(experiment_name):
    try:
        sm_experiment = Experiment.load(experiment_name)
    except:
        sm_experiment = Experiment.create(experiment_name=experiment_name,
                                          tags=[
                                              {
                                                  'Key': 'modelname',
                                                  'Value': 'fashion-mnist'
                                              },
                                          ])

In [17]:
def create_trial(experiment_name, i_type, i_cnt, spot):
    create_date = strftime("%m%d-%H%M%s")
    
    algo = 'dp'
    
    spot = 's' if spot else 'd'
    i_tag = 'test'
    
    if i_type == 'ml.p3.16xlarge':
        i_tag = 'p3'
    elif i_type == 'ml.p2.8xlarge':
        i_tag = 'p2'
    elif i_type == 'ml.p3dn.24xlarge':
        i_tag = 'p3dn'
    elif i_type == 'ml.p4d.24xlarge':
        i_tag = 'p4d'
    else:
        i_tag = 'others'
        
    trial = "-".join([i_tag,str(i_cnt),algo, spot])
       
    sm_trial = Trial.create(trial_name=f'{experiment_name}-{trial}-{create_date}',
                            experiment_name=experiment_name)

    job_name = f'{sm_trial.trial_name}'
    return job_name

In [39]:
experiment_name = 'fashion-mnist-debugger'
# instance_type = 'ml.p3.2xlarge'
instance_type = 'ml.g4dn.4xlarge'
# instance_type = 'ml.c5.2xlarge'
instance_count = 1
do_spot_training=False

create_experiment(experiment_name)
job_name = create_trial(experiment_name, instance_type, instance_count, do_spot_training)
job_name

'fashion-mnist-debugger-others-1-dp-d-1214-15131639494826'

## Debugger
- https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-debugger/tensorflow2/tensorflow2_zero_code_change/tf2-keras-default-container.ipynb

### Hook

In [40]:
hook_config = DebuggerHookConfig(
    hook_parameters={"save_interval": "100"},
    collection_configs=[
        CollectionConfig("weights"),
        CollectionConfig(name="biases", parameters={"save_interval": "10", "end_step": "500"}),
    ],
)

### Rules

In [41]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

rules = [
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ProfilerRule.sagemaker(rule_configs.LowGPUUtilization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

### Profiler

In [42]:
from sagemaker.debugger import ProfilerConfig, FrameworkProfile

profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=FrameworkProfile(
        local_path="/opt/ml/output/profiler/", start_step=5, num_steps=10
    ),
)

### Training

In [43]:
hyperparams_managed={'epochs': 100,
                     'learning-rate': 0.05
                    }

# image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/tensorflow-training:2.3.1-gpu-py37-cu110-ubuntu18.04"

In [44]:
est_managed = TensorFlow(entry_point='mnist_keras_tf.py', 
                         role=role,
                         instance_count=instance_count, 
                         instance_type=instance_type,
                         framework_version='2.3', 
                         py_version='py37',
                         output_path=output_path,
                         hyperparameters=hyperparams_managed,
                         metric_definitions=metric_definitions,
                         base_job_name='tensorflow-debugger',
                         rules=rules,
                         debugger_hook_config=hook_config,
                         disable_profiler=False, # default: False,
                         profiler_config=profiler_config # Debugger Profiling
                         )

In [45]:
est_managed.fit(inputs={'training': training_input_path, 'validation': validation_input_path},
                experiment_config={
                    'TrialName': job_name,
                    'TrialComponentDisplayName': job_name,
                },
                wait=False)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: tensorflow-debugger-2021-12-14-15-13-49-481


In [46]:
job_name=est_managed.latest_training_job.name

In [47]:
sess.logs_for_job(job_name=job_name, wait=True)

2021-12-14 15:13:49 Starting - Starting the training job...
2021-12-14 15:14:17 Starting - Launching requested ML instancesLossNotDecreasing: InProgress
LowGPUUtilization: InProgress
ProfilerReport: InProgress
...
2021-12-14 15:14:44 Starting - Preparing the instances for training.........
2021-12-14 15:16:17 Downloading - Downloading input data
2021-12-14 15:16:17 Training - Downloading the training image.........
2021-12-14 15:17:50 Training - Training image download completed. Training in progress..[34m2021-12-14 15:17:50.943291: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2021-12-14 15:17:50.946214: I tensorflow/core/profiler/internal/smprofiler_config_reader.cc:123] PID of the process that is writing to the timeline : 1[0m
[34m2021-12-14 15:17:50.946616: I tensorflow/core/profiler/internal/smprofiler_timeline.cc:121] SageMaker Profiler Timeline Writer read the following config parameters :[0m
[34m2021-12-14 15:

### Download profiler report

In [48]:
profiler_path = './profiler-{}'.format(strftime("%m%d-%H%M%s"))

In [49]:
os.makedirs(profiler_path, exist_ok=True)

In [50]:
rule_output_path = est_managed.output_path + '/' + est_managed.latest_training_job.job_name + "/rule-output"
rule_output_path

's3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-14-15-13-49-481/rule-output'

In [51]:
! aws s3 cp {rule_output_path} {profiler_path} --recursive

download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-14-15-13-49-481/rule-output/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json to profiler-1214-15201639495233/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json
download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-14-15-13-49-481/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb to profiler-1214-15201639495233/ProfilerReport/profiler-output/profiler-report.ipynb
download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-14-15-13-49-481/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json to profiler-1214-15201639495233/ProfilerReport/profiler-output/profiler-reports/Dataloader.json
download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-d