# SageMaker Debugger rules

Source
- https://sagemaker.readthedocs.io/en/stable/amazon_sagemaker_debugger.html
- https://gitlab.com/juliensimon/dlnotebooks/blob/master/keras/05-keras-blog-post/Fashion%20MNIST-SageMaker.ipynb

In [65]:
install_needed = True
# install_needed = False

In [66]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
    !{sys.executable} -m pip install -U 'sagemaker[local]'
    !{sys.executable} -m pip install -U sagemaker-experiments # SageMaker Experiments SDK 
    !{sys.executable} -m pip install -U sagemaker             # SageMaker Python SDK
#     !/bin/bash ./local/local_mode_setup.sh
    IPython.Application.instance().kernel.do_shutdown(True)

installing deps and restarting kernel
Collecting sagemaker[local]
  Using cached sagemaker-2.70.0-py2.py3-none-any.whl
Collecting boto3>=1.20.18
  Downloading boto3-1.20.23-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 1.7 MB/s eta 0:00:01
Collecting botocore<1.24.0,>=1.23.23
  Downloading botocore-1.23.23-py3-none-any.whl (8.4 MB)
[K     |████████████████████████████████| 8.4 MB 9.7 MB/s eta 0:00:01
Installing collected packages: botocore, boto3, sagemaker
  Attempting uninstall: botocore
    Found existing installation: botocore 1.23.4
    Uninstalling botocore-1.23.4:
      Successfully uninstalled botocore-1.23.4
  Attempting uninstall: boto3
    Found existing installation: boto3 1.20.4
    Uninstalling boto3-1.20.4:
      Successfully uninstalled boto3-1.20.4
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.68.0
    Uninstalling sagemaker-2.68.0:
      Successfully uninstalled sagemaker-2.68.0
[31mERROR: pip's dependen

In [None]:
from IPython.display import Image
Image("fashion-mnist-sprite.png")

In [14]:
import sagemaker

from smexperiments.experiment import Experiment ### SM Experiment
from smexperiments.trial import Trial           ### SM Experiment

from sagemaker.debugger import (
    Rule,
    DebuggerHookConfig,
    TensorBoardOutputConfig,
    CollectionConfig,
    rule_configs,
    ProfilerRule
)

from time import strftime

sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
prefix = 'keras-fashion-mnist-debugger'

output_path = 's3://{}/{}/output'.format(bucket, prefix)
s3_input_path = 's3://{}/{}/data'.format(bucket, prefix)
training_input_path = s3_input_path + '/train'
validation_input_path = s3_input_path + '/val'

## Download the Fashion-MNIST dataset

In [3]:
import os
import keras
import numpy as np
from keras.datasets import fashion_mnist
(x_train, y_train), (x_val, y_val) = fashion_mnist.load_data()

os.makedirs("./data/train", exist_ok = True)
os.makedirs("./data/val", exist_ok = True)

np.save('./data/train/x_train.npy', x_train)
np.save('./data/train/y_train.npy', y_train)
np.save('./data/val/x_val.npy', x_val)
np.save('./data/val/y_val.npy', y_val)

Using TensorFlow backend.


In [4]:
!ls -lat ./data

total 20
drwxrwxr-x 4 ec2-user ec2-user 4096 Dec 10 01:31 ..
drwxrwxr-x 2 ec2-user ec2-user 4096 Dec 10 01:01 val
drwxrwxr-x 2 ec2-user ec2-user 4096 Dec 10 01:01 train
drwxrwxr-x 5 ec2-user ec2-user 4096 Dec 10 01:01 .
drwxrwxr-x 2 ec2-user ec2-user 4096 Dec 10 00:57 .ipynb_checkpoints


##  Upload Fashion-MNIST data to S3

In [5]:
!aws s3 sync ./data {s3_input_path}

upload: data/val/y_val.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/val/y_val.npy
upload: data/train/y_train.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/train/y_train.npy
upload: data/val/x_val.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/val/x_val.npy
upload: data/train/x_train.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/train/x_train.npy


## Local mode training

In [6]:
!pygmentize mnist_keras_tf.py

[34mimport[39;49;00m [04m[36margparse[39;49;00m, [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m

[34mimport[39;49;00m [04m[36mtensorflow[39;49;00m [34mas[39;49;00m [04m[36mtf[39;49;00m
[34mimport[39;49;00m [04m[36mkeras[39;49;00m
[34mfrom[39;49;00m [04m[36mkeras[39;49;00m [34mimport[39;49;00m backend [34mas[39;49;00m K
[34mfrom[39;49;00m [04m[36mkeras[39;49;00m[04m[36m.[39;49;00m[04m[36mmodels[39;49;00m [34mimport[39;49;00m Sequential
[34mfrom[39;49;00m [04m[36mkeras[39;49;00m[04m[36m.[39;49;00m[04m[36mlayers[39;49;00m [34mimport[39;49;00m Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPooling2D
[34mfrom[39;49;00m [04m[36mkeras[39;49;00m[04m[36m.[39;49;00m[04m[36moptimizers[39;49;00m [34mimport[39;49;00m SGD
[34mfrom[39;49;00m [04m[36mkeras[39;49;00m[04m[36m.[39;49;00m[04m[36mutils[39;49;00m [34mimport[39;49;00m mult

In [27]:
from sagemaker.tensorflow import TensorFlow

hyperparams_local={'epochs': 1,
                   'learning-rate': 0.5
                  }

metric_definitions=[
    {'Name': 'train:loss', 'Regex': ' loss: ([0-9\\.]+)'},
    {'Name': 'train:acc', 'Regex': ' accuracy: ([0-9\\.]+)'},
    {'Name': 'val:loss', 'Regex': ' val_loss: ([0-9\\.]+)'},
    {'Name': 'val:acc', 'Regex': ' val_accuracy: ([0-9\\.]+)'}
]

est_local = TensorFlow(entry_point='mnist_keras_tf.py', 
                       role=role,
                       instance_count=1, 
                       instance_type='local',
                       framework_version='2.1', 
                       py_version='py3',
                       output_path=output_path,
                       hyperparameters=hyperparams_local,
                       metric_definitions=metric_definitions
                       )

In [28]:
est_local.fit({'training': training_input_path, 'validation': validation_input_path}) 

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2021-12-10-01-40-52-533
INFO:sagemaker.local.local_session:Starting training job
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-inz3f:
    command: train
    container_name: faf3gttjnl-algo-1-inz3f
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/tensorflow-training:2.1-cpu-py3
    networks:
      sagemaker-local:
        aliases:
        - algo-1-inz3f
    stdin_open: true
    tty: true
    volumes:
    - /tmp/tmp6om5a0du/algo-1-inz3f/input:/opt/ml/input
    - /tmp/tmp6om5a0du/algo-1-inz3f/output/data:/

Creating faf3gttjnl-algo-1-inz3f ... 
Creating faf3gttjnl-algo-1-inz3f ... done
Attaching to faf3gttjnl-algo-1-inz3f
[36mfaf3gttjnl-algo-1-inz3f |[0m 2021-12-10 01:40:57,497 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training
[36mfaf3gttjnl-algo-1-inz3f |[0m 2021-12-10 01:40:57,503 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36mfaf3gttjnl-algo-1-inz3f |[0m 2021-12-10 01:40:57,712 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36mfaf3gttjnl-algo-1-inz3f |[0m 2021-12-10 01:40:57,729 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36mfaf3gttjnl-algo-1-inz3f |[0m 2021-12-10 01:40:57,744 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36mfaf3gttjnl-algo-1-inz3f |[0m 2021-12-10 01:40:57,755 sagemaker-training-toolkit INFO     Invoking user script
[36mfaf3gttjnl-algo-1-inz3f |[0m 
[36mfaf3

## Experiments

In [29]:
def create_experiment(experiment_name):
    try:
        sm_experiment = Experiment.load(experiment_name)
    except:
        sm_experiment = Experiment.create(experiment_name=experiment_name,
                                          tags=[
                                              {
                                                  'Key': 'modelname',
                                                  'Value': 'fashion-mnist'
                                              },
                                          ])

In [30]:
def create_trial(experiment_name, i_type, i_cnt, spot):
    create_date = strftime("%m%d-%H%M%s")
    
    algo = 'dp'
    
    spot = 's' if spot else 'd'
    i_tag = 'test'
    
    if i_type == 'ml.p3.16xlarge':
        i_tag = 'p3'
    elif i_type == 'ml.p2.8xlarge':
        i_tag = 'p2'
    elif i_type == 'ml.p3dn.24xlarge':
        i_tag = 'p3dn'
    elif i_type == 'ml.p4d.24xlarge':
        i_tag = 'p4d'
    else:
        i_tag = 'others'
        
    trial = "-".join([i_tag,str(i_cnt),algo, spot])
       
    sm_trial = Trial.create(trial_name=f'{experiment_name}-{trial}-{create_date}',
                            experiment_name=experiment_name)

    job_name = f'{sm_trial.trial_name}'
    return job_name

In [31]:
experiment_name = 'fashion-mnist-debugger'
instance_type = 'ml.p3.2xlarge'
instance_count = 1
do_spot_training=False

create_experiment(experiment_name)
job_name = create_trial(experiment_name, instance_type, instance_count, do_spot_training)
job_name

'fashion-mnist-debugger-others-1-dp-d-1210-01411639100465'

## Debugger
- https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-debugger/tensorflow2/tensorflow2_zero_code_change/tf2-keras-default-container.ipynb

### Hook

In [56]:
hook_config = DebuggerHookConfig(
    hook_parameters={"save_interval": "100"},
    collection_configs=[
        CollectionConfig("weights"),
        CollectionConfig(name="biases", parameters={"save_interval": "10", "end_step": "500"}),
    ],
)

### Rules

In [57]:
rules = [
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

### Profiler

In [58]:
from sagemaker.debugger import ProfilerConfig, FrameworkProfile

profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=FrameworkProfile(num_steps=10)
)

### Training

In [60]:
hyperparams_managed={'epochs': 500,
                     'learning-rate': 0.05
                    }

In [61]:
est_managed = TensorFlow(entry_point='mnist_keras_tf.py', 
                         role=role,
                         instance_count=instance_count, 
                         instance_type=instance_type,
                         framework_version='2.1', 
                         py_version='py3',
                         output_path=output_path,
                         hyperparameters=hyperparams_managed,
                         metric_definitions=metric_definitions,
                         base_job_name='tensorflow-debugger',
#                          rules=rules,
#                          debugger_hook_config=hook_config,
                         disable_profiler=False # default: False
                         )

In [62]:
est_managed.fit(inputs={'training': training_input_path, 'validation': validation_input_path},
                experiment_config={
                    'TrialName': job_name,
                    'TrialComponentDisplayName': job_name,
                },
                wait=False)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: tensorflow-debugger-2021-12-10-02-38-05-880


### Download profiler report

In [50]:
profiler_path = './profiler-{}'.format(strftime("%m%d-%H%M%s"))

In [52]:
os.makedirs(profiler_path, exist_ok=True)

In [53]:
rule_output_path = est_managed.output_path + '/' + est_managed.latest_training_job.job_name + "/rule-output"
rule_output_path

's3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-10-02-17-49-503/rule-output'

In [54]:
! aws s3 cp {rule_output_path} {profiler_path} --recursive

download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-10-02-17-49-503/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb to profiler-1210-02271639103253/ProfilerReport/profiler-output/profiler-report.ipynb
download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-10-02-17-49-503/rule-output/ProfilerReport/profiler-output/profiler-reports/LowGPUUtilization.json to profiler-1210-02271639103253/ProfilerReport/profiler-output/profiler-reports/LowGPUUtilization.json
download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-10-02-17-49-503/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json to profiler-1210-02271639103253/ProfilerReport/profiler-output/profiler-reports/Dataloader.json
download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tens