# SageMaker Debugger rules

Source
- https://sagemaker.readthedocs.io/en/stable/amazon_sagemaker_debugger.html
- https://gitlab.com/juliensimon/dlnotebooks/blob/master/keras/05-keras-blog-post/Fashion%20MNIST-SageMaker.ipynb
- https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-debugger/tensorflow_profiling/tf-resnet-profiling-single-gpu-single-node.ipynb

In [None]:
# install_needed = True
install_needed = False

In [None]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
    !{sys.executable} -m pip install -U 'sagemaker[local]'
    !{sys.executable} -m pip install -U sagemaker-experiments # SageMaker Experiments SDK 
    !{sys.executable} -m pip install -U sagemaker             # SageMaker Python SDK
#     !/bin/bash ./local/local_mode_setup.sh
    IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
# from IPython.display import Image
# Image("fashion-mnist-sprite.png")

In [4]:
import sagemaker

from smexperiments.experiment import Experiment ### SM Experiment
from smexperiments.trial import Trial           ### SM Experiment

from sagemaker.debugger import (
    Rule,
    DebuggerHookConfig,
    TensorBoardOutputConfig,
    CollectionConfig,
    rule_configs,
    ProfilerConfig,
    FrameworkProfile,
    PythonProfiler,
    PythonProfilingConfig,
    cProfileTimer,
    DataloaderProfilingConfig,
    DetailedProfilingConfig,
    ProfilerRule
)

from time import strftime

import boto3

boto_session = boto3.session.Session()
region = boto_session.region_name

sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
prefix = 'keras-fashion-mnist-debugger'

output_path = 's3://{}/{}/output'.format(bucket, prefix)
s3_input_path = 's3://{}/{}/data'.format(bucket, prefix)
training_input_path = s3_input_path + '/train'
validation_input_path = s3_input_path + '/val'

## Download the Fashion-MNIST dataset

In [5]:
import os
import numpy as np
import tensorflow as tf

fashion_mnist = tf.keras.datasets.fashion_mnist
(x_train, y_train), (x_val, y_val) = fashion_mnist.load_data()

os.makedirs("./data/train", exist_ok = True)
os.makedirs("./data/val", exist_ok = True)

np.save('./data/train/x_train.npy', x_train)
np.save('./data/train/y_train.npy', y_train)
np.save('./data/val/x_val.npy', x_val)
np.save('./data/val/y_val.npy', y_val)

In [6]:
!ls -lat ./data

total 16
drwxrwxr-x 7 ec2-user ec2-user 4096 Dec 22 03:31 ..
drwxrwxr-x 2 ec2-user ec2-user 4096 Dec 14 15:02 val
drwxrwxr-x 2 ec2-user ec2-user 4096 Dec 14 15:02 train
drwxrwxr-x 4 ec2-user ec2-user 4096 Dec 14 15:02 .


##  Upload Fashion-MNIST data to S3

In [7]:
!aws s3 sync ./data {s3_input_path}

upload: data/val/y_val.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/val/y_val.npy
upload: data/train/y_train.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/train/y_train.npy
upload: data/val/x_val.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/val/x_val.npy
upload: data/train/x_train.npy to s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/data/train/x_train.npy


## Training code 준비

In [8]:
!pygmentize mnist_keras_tf.py

[34mimport[39;49;00m [04m[36margparse[39;49;00m, [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m

[34mimport[39;49;00m [04m[36mtensorflow[39;49;00m [34mas[39;49;00m [04m[36mtf[39;49;00m
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m [34mimport[39;49;00m keras
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m[04m[36m.[39;49;00m[04m[36mkeras[39;49;00m [34mimport[39;49;00m backend [34mas[39;49;00m K
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m[04m[36m.[39;49;00m[04m[36mkeras[39;49;00m[04m[36m.[39;49;00m[04m[36mmodels[39;49;00m [34mimport[39;49;00m Sequential
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m[04m[36m.[39;49;00m[04m[36mkeras[39;49;00m[04m[36m.[39;49;00m[04m[36mlayers[39;49;00m [34mimport[39;49;00m Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPooling2D
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m[04m[3

## Experiments

In [9]:
def create_experiment(experiment_name):
    try:
        sm_experiment = Experiment.load(experiment_name)
    except:
        sm_experiment = Experiment.create(experiment_name=experiment_name,
                                          tags=[
                                              {
                                                  'Key': 'modelname',
                                                  'Value': 'fashion-mnist'
                                              },
                                          ])

In [10]:
def create_trial(experiment_name, i_type, i_cnt, spot):
    create_date = strftime("%m%d-%H%M%s")
    
    algo = 'dp'
    
    spot = 's' if spot else 'd'
    i_tag = 'test'
    
    if i_type == 'ml.p3.16xlarge':
        i_tag = 'p3'
    elif i_type == 'ml.p2.8xlarge':
        i_tag = 'p2'
    elif i_type == 'ml.p3dn.24xlarge':
        i_tag = 'p3dn'
    elif i_type == 'ml.p4d.24xlarge':
        i_tag = 'p4d'
    else:
        i_tag = 'others'
        
    trial = "-".join([i_tag,str(i_cnt),algo, spot])
       
    sm_trial = Trial.create(trial_name=f'{experiment_name}-{trial}-{create_date}',
                            experiment_name=experiment_name)

    job_name = f'{sm_trial.trial_name}'
    return job_name

In [11]:
experiment_name = 'fashion-mnist-debugger'
# instance_type = 'ml.p3.2xlarge'
instance_type = 'ml.g4dn.4xlarge'
# instance_type = 'ml.c5.2xlarge'
instance_count = 1
do_spot_training=False

create_experiment(experiment_name)
job_name = create_trial(experiment_name, instance_type, instance_count, do_spot_training)
job_name

'fashion-mnist-debugger-others-1-dp-d-1222-03311640143882'

## Debugger
- https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-debugger/tensorflow2/tensorflow2_zero_code_change/tf2-keras-default-container.ipynb

### Hook
- Set the debugger hook config to save tensors

In [12]:
hook_config = DebuggerHookConfig(
    hook_parameters={"save_interval": "100"},
    collection_configs=[
        CollectionConfig("weights"),
        CollectionConfig(name="biases", parameters={"save_interval": "10", "end_step": "500"}),
    ],
)

### Rules
- Set the rules to analyze tensors emitted during training. 
- These specific set of rules will inspect the overall training performance and progress of the model

In [13]:
rules = [
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
    Rule.sagemaker(rule_configs.stalled_training_rule())
]

### Profiler
- Configure Debugger Framework Profiling: https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-configure-framework-profiling.html

In [14]:
profiler_config = ProfilerConfig(
    system_monitor_interval_millis = 500,
    framework_profile_params = FrameworkProfile(
        detailed_profiling_config = DetailedProfilingConfig(
            start_step = 5, 
            num_steps = 10
        ),
        dataloader_profiling_config = DataloaderProfilingConfig(
            start_step = 7, 
            num_steps = 10
        ),
        python_profiling_config = PythonProfilingConfig(
            start_step = 9, 
            num_steps = 10,
            python_profiler = PythonProfiler.CPROFILE, 
            cprofile_timer = cProfileTimer.TOTAL_TIME
        )
    )
)

## Training

In [15]:
from sagemaker.tensorflow import TensorFlow

metric_definitions=[
    {'Name': 'train:loss', 'Regex': ' loss: ([0-9\\.]+)'},
    {'Name': 'train:acc', 'Regex': ' accuracy: ([0-9\\.]+)'},
    {'Name': 'val:loss', 'Regex': ' val_loss: ([0-9\\.]+)'},
    {'Name': 'val:acc', 'Regex': ' val_accuracy: ([0-9\\.]+)'}
]

hyperparams_managed={'epochs': 100,
                     'learning-rate': 0.05
                    }

In [16]:
est_managed = TensorFlow(entry_point='mnist_keras_tf.py', 
                         role=role,
                         instance_count=instance_count, 
                         instance_type=instance_type,
                         framework_version='2.3', 
                         py_version='py37',
                         output_path=output_path,
                         hyperparameters=hyperparams_managed,
                         metric_definitions=metric_definitions,
                         base_job_name='tensorflow-debugger',
                         rules=rules,
#                          debugger_hook_config=hook_config,
#                          disable_profiler=False, # default: False,
#                          profiler_config=profiler_config # Debugger Profiling
                         )

In [17]:
est_managed.fit(inputs={'training': training_input_path, 'validation': validation_input_path},
                experiment_config={
                    'TrialName': job_name,
                    'TrialComponentDisplayName': job_name,
                },
                wait=False)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: tensorflow-debugger-2021-12-22-03-31-22-518


In [18]:
job_name=est_managed.latest_training_job.name

In [19]:
sess.logs_for_job(job_name=job_name, wait=True)

2021-12-22 03:31:23 Starting - Starting the training job...
2021-12-22 03:31:46 Starting - Launching requested ML instancesLossNotDecreasing: InProgress
Overfit: InProgress
Overtraining: InProgress
StalledTrainingRule: InProgress
ProfilerReport: InProgress
......
2021-12-22 03:32:46 Starting - Preparing the instances for training......
2021-12-22 03:33:49 Downloading - Downloading input data
2021-12-22 03:33:49 Training - Downloading the training image............
2021-12-22 03:35:47 Training - Training image download completed. Training in progress.[34m2021-12-22 03:35:40.773687: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2021-12-22 03:35:40.777026: I tensorflow/core/profiler/internal/smprofiler_config_reader.cc:123] PID of the process that is writing to the timeline : 1[0m
[34m2021-12-22 03:35:40.777472: I tensorflow/core/profiler/internal/smprofiler_timeline.cc:121] SageMaker Profiler Timeline Writer read the foll

### Download profiler report

In [20]:
profiler_path = './profiler-{}'.format(strftime("%m%d-%H%M%s"))

In [21]:
os.makedirs(profiler_path, exist_ok=True)

In [22]:
rule_output_path = est_managed.output_path + '/' + est_managed.latest_training_job.job_name + "/rule-output"
rule_output_path

's3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-22-03-31-22-518/rule-output'

In [23]:
! aws s3 cp {rule_output_path} {profiler_path} --recursive

download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-22-03-31-22-518/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb to profiler-1222-03381640144317/ProfilerReport/profiler-output/profiler-report.ipynb
download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-22-03-31-22-518/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json to profiler-1222-03381640144317/ProfilerReport/profiler-output/profiler-reports/Dataloader.json
download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-debugger-2021-12-22-03-31-22-518/rule-output/ProfilerReport/profiler-output/profiler-reports/LoadBalancing.json to profiler-1222-03381640144317/ProfilerReport/profiler-output/profiler-reports/LoadBalancing.json
download: s3://sagemaker-ap-northeast-2-889750940888/keras-fashion-mnist-debugger/output/tensorflow-d