## Overview

In [35]:
%pip install pandas==1.1.5
%pip install numpy --upgrade
%pip install jinja2==3.0
%pip install smdebug
%pip install sagemaker==2.95.0

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.2.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.2.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.2.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;4

In [2]:
import glob
import os
import sys

import boto3
import numpy as np
import pandas as pd
import pyarrow
import sagemaker
import smdebug
from IPython.display import HTML, display
from sagemaker.debugger import (
    CollectionConfig,
    DataloaderProfilingConfig,
    DebuggerHookConfig,
    DetailedProfilingConfig,
    FrameworkProfile,
    ProfilerConfig,
    ProfilerRule,
    PythonProfiler,
    PythonProfilingConfig,
    Rule,
    cProfileTimer,
    rule_configs,
)



In [3]:
# A single SageMaker session is created here and reused for the role lookup,
# the region and the default bucket, instead of building a new session per call.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)
# Read the region from the session so it is the one the session actually uses;
# boto3.Session().region_name can be None, which would make the os.environ
# assignment below raise a TypeError.
region = sagemaker_session.boto_region_name
bucket_name = sagemaker_session.default_bucket()

# Key prefix under which this notebook stores its data and training output.
prefix = 'ipinyou-xgboost'
os.environ["AWS_REGION"] = region

print(f'Region : {region}')
print(f'IAM Role : {role}')
print(f'S3 Bucket : {bucket_name}')

# Report the installed versions of the SageMaker SDK, Python, Boto3 and SMDebug
print(f'SageMaker Python SDK version : {sagemaker.__version__}')
print(f'Python version : {sys.version}')
print(f'Boto3 version : {boto3.__version__}')
print(f'SMDebug version : {smdebug.__version__}')

Region : us-east-1
IAM Role : arn:aws:iam::431615879134:role/sagemaker-test-role
S3 Bucket : sagemaker-us-east-1-431615879134
SageMaker Python SDK version : 2.95.0
Python version : 3.7.10 (default, Jun  4 2021, 14:48:32) 
[GCC 7.5.0]
Boto3 version : 1.24.12
SMDebug version : 1.0.12


# Data

In [4]:
# upload data to s3
# Copy the local train/test CSV files to <prefix>/data/<split> in the bucket.
# The cell's displayed value is the return value of the second (test) upload.
sagemaker_session.upload_data(
    path='train.csv',
    bucket=bucket_name,
    key_prefix=f'{prefix}/data/train',
)
sagemaker_session.upload_data(
    path='test.csv',
    bucket=bucket_name,
    key_prefix=f'{prefix}/data/test',
)

's3://sagemaker-us-east-1-431615879134/ipenyou-xgboost/data/test/test.csv'

# Training

## Profiling | Debugger

https://github.com/aws-samples/amazon-sagemaker-script-mode-with-debugger

### Set up the debugger

1. Profiler Config - configure how to collect system metrics and framework metrics from your training job and save them into your S3 bucket or local machine.
1. Debugger Hook Config - configure how to collect output tensors from your training job and save them into your S3 bucket or local machine.
1. Rules - configure the Debugger built-in rules that you want to run in parallel. These rules look for common training issues. The ProfilerReport rule saves the profiling reports into your S3 bucket.


In [5]:
# Look up the image URI of the built-in XGBoost container (version 1.2-2).
# Reuses the `region` resolved in the configuration cell rather than creating
# another boto3 session just to read the region name.
container = sagemaker.image_uris.retrieve(framework='xgboost', region=region, version='1.2-2')
print(container)

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-2


In [6]:
# CSV training inputs pointing at the S3 prefixes the data was uploaded to.
s3_input_train = sagemaker.inputs.TrainingInput(
    content_type='csv',
    s3_data=f's3://{bucket_name}/{prefix}/data/train',
)
s3_input_test = sagemaker.inputs.TrainingInput(
    content_type='csv',
    s3_data=f's3://{bucket_name}/{prefix}/data/test',
)

In [7]:
# Rule list for the training job: a single rule built from the XGBoost report config.
# note rule does not work with xgb version 1.3-x or greater
xgboost_report_rule = Rule.sagemaker(rule_configs.create_xgboost_report())
rules = [xgboost_report_rule]

In [8]:
# Hyperparameters for the built-in XGBoost algorithm; values are passed as strings.
hyperparameters = {
    "max_depth": "10",
    "eta": "0.2",
    "objective": "binary:logistic",
    "num_round": "50",
    "eval_metric": "logloss"
}

# Training output is written under <prefix>/output/ in the bucket.
# The trailing slash matters: a later cell concatenates the job name directly onto this path.
output_path = f's3://{bucket_name}/{prefix}/output/'

# construct a SageMaker estimator that calls the xgboost-container
# `rules` comes from the cell above and is no longer redefined here.
# note rule does not work with xgb version 1.3-x or greater
estimator = sagemaker.estimator.Estimator(image_uri=container,
                                          hyperparameters=hyperparameters,
                                          role=role,
                                          instance_count=1,
                                          instance_type='ml.m5.2xlarge',
                                          volume_size=5, # 5 GB
                                          output_path=output_path,
                                          sagemaker_session=sagemaker_session,  # reuse the notebook's session
                                          rules=rules)

In [9]:
# Launch the training job: the train channel reads the training split and the
# validation channel reads the test split. (The train channel previously pointed
# at the test split as well, so the model was trained on its own evaluation data.)
estimator.fit({'train': s3_input_train, 'validation': s3_input_test})

2022-08-09 21:20:50 Starting - Starting the training job...
2022-08-09 21:21:19 Starting - Preparing the instances for trainingCreateXgboostReport: InProgress
ProfilerReport-1660080050: InProgress
.........
2022-08-09 21:22:39 Downloading - Downloading input data...
2022-08-09 21:23:19 Training - Downloading the training image...
2022-08-09 21:23:50 Uploading - Uploading generated training model.[34m[2022-08-09 21:23:43.413 ip-10-0-236-64.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2022-08-09:21:23:43:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2022-08-09:21:23:43:INFO] Failed to parse hyperparameter eval_metric value logloss to Json.[0m
[34mReturning the value itself[0m
[34m[2022-08-09:21:23:43:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2022-08-09:21:23:43:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-08-09:21:23:43:

UnexpectedStatusException: Error for Training job sagemaker-xgboost-2022-08-09-21-20-50-642: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_xgboost_container/training.py", line 94, in main
    train(framework.training_env())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_xgboost_container/training.py", line 90, in train
    run_algorithm_mode()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_xgboost_container/training.py", line 68, in run_algorithm_mode
    checkpoint_config=checkpoint_config
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_xgboost_container/algorithm_mode/train.py", line 141, in sagemaker_train
    csv_weights, is_pipe, combine_train_val)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_xgboost_container/algorithm_mode/train.py", line 89, in get_validated_dmatrices
    if train_files_size > 0 else None
  File "/miniconda3/lib/python3.7/site-packages/

To confirm that the trained model works for inference, we can deploy it and run a few predictions against the endpoint.

* https://github.com/aws-samples/sagemaker-end-to-end-distributed-tensorflow2
* https://github.com/aws-samples/amazon-sagemaker-script-mode/blob/master/tf-distribution-options/tf-distributed-training.ipynb

## View the debugger profiling report

Note that the training time listed in the report can differ slightly from the time reported above.

In [100]:
# Show the S3 output path held by the estimator's profiler config.
# NOTE(review): presumably only populated once a training job has been started — confirm.
estimator.profiler_config.s3_output_path

's3://sagemaker-us-east-1-431615879134/'

In [94]:
# Show the S3 output path the estimator was configured with.
estimator.output_path

's3://sagemaker-us-east-1-431615879134/'

In [93]:
# Show the name of the estimator's latest training job; later cells use it to build S3 paths.
estimator.latest_training_job.job_name

'tensorflow2-profile-ipenyou-csv-2022-08-02-20-52-36-195'

In [None]:
# Download the rule output of the latest training job into ./profile.
# download_data() requires a local path and a bucket; the bare call raised a TypeError.
# NOTE(review): the intended download target was not stated in the original cell;
# the rule-output prefix is assumed because later cells copy the same location.
job_name = estimator.latest_training_job.job_name
sagemaker_session.download_data(
    path='./profile',
    bucket=bucket_name,
    key_prefix=f'{prefix}/output/{job_name}/rule-output',
)

In [None]:
# Build the S3 key of the debugger's auto-generated profiling report.
# read_s3_file() takes a bucket name and a key, so the key is built from `prefix`
# rather than from estimator.output_path, which is a full s3:// URI.
# NOTE(review): the training log names the rule `ProfilerReport-<timestamp>`; if the
# rule-output directory carries the same suffix, adjust the `ProfilerReport` segment
# (list the rule-output prefix to confirm).
job_name = estimator.latest_training_job.job_name
profiling_report_key = (
    f'{prefix}/output/{job_name}'
    '/rule-output/ProfilerReport/profiler-output/profiler-report.html'
)
profiling_report = sagemaker_session.read_s3_file(bucket_name, profiling_report_key)

# Render the debugger's auto-generated profiling report inline
display(HTML(profiling_report))

In [None]:
# Wrap the latest training job in smdebug's TrainingJob helper.
# The job is assumed to have run in the same AWS region as the S3 bucket holding
# the debugger output; change the region argument below if that is not the case.
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob

job_name = estimator.latest_training_job.job_name
job_region = sagemaker_session.boto_region_name
tj = TrainingJob(job_name, job_region)

In [None]:
# Check that jinja2 exposes both names.
# NOTE(review): `Markup` was removed from the jinja2 namespace in jinja2 3.1, so this
# import only works with the jinja2==3.0 pin from the install cell (restart the kernel
# after installing). Neither name is used in the cells visible here — confirm whether
# this cell is still needed.
from jinja2 import Environment, Markup

In [111]:
import jinja2

In [112]:
# Show the jinja2 version the kernel actually loaded; compare it with the jinja2==3.0 pin in the install cell.
jinja2.__version__

'3.1.2'

In [None]:
# Recursively list the training job's profiler output location in S3, keeping only paths that end in .json.
! aws s3 ls {tj.profiler_s3_output_path} --recursive | grep '\.json$'

In [None]:
# Print the two pieces needed to locate the report: the estimator's output path and the job name.
print(
    f'Output Path = {estimator.output_path}',
    f'Training Job Name = {estimator.latest_training_job.job_name}',
    sep='\n',
)

In [None]:
# S3 location of the rule output for the latest training job.
# No separator is inserted between the output path and the job name.
rule_output_path = f"{estimator.output_path}{estimator.latest_training_job.job_name}/rule-output"

In [None]:
# Recursively copy everything under the rule-output location into the local ./profile/ directory.
!aws s3 cp {rule_output_path} ./profile/ --recursive