# XGBoost 학습 후 Batch Transform w/ SM Processing
- Training: Managed training
- Processing: Local mode

In [43]:
install_needed = True  # should only be True once
# install_needed = False

In [44]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
    !{sys.executable} -m pip install -U 'sagemaker[local]'
    !{sys.executable} -m pip install -U sagemaker-experiments # SageMaker Experiments SDK 
    !{sys.executable} -m pip install -U sagemaker             # SageMaker Python SDK
    !{sys.executable} -m pip install -U boto3
    
    IPython.Application.instance().kernel.do_shutdown(True)
# !pip install xgboost

installing deps and restarting kernel
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [1]:
import os
import pandas as pd

import sagemaker

import time
from time import gmtime, strftime 

## Prepare & Uplad dataset

In [2]:
os.makedirs('./data', exist_ok=True)

In [3]:
from sklearn.datasets import load_boston

boston = load_boston()

data = pd.DataFrame(boston.data)
data.columns = boston.feature_names
data['PRICE'] = boston.target

data.to_csv('./data/boston.csv', sep=',', index=False)

In [4]:
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'batch-transform/xgboost-byos'
output_path = f"s3://{bucket}/{prefix}/output"

role = sagemaker.get_execution_role()
role

'arn:aws:iam::889750940888:role/sinjoonk-sagemaker-demo-role'

In [5]:
input_data = session.upload_data('./data', key_prefix=f'{prefix}/data')
input_data

's3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/data'

In [6]:
!aws s3 ls {input_data} --recursive

2021-12-14 07:45:30      41085 batch-transform/xgboost-byos/data/.ipynb_checkpoints/boston-checkpoint.csv
2021-12-14 07:45:30      39170 batch-transform/xgboost-byos/data/boston.csv


In [7]:
%%writefile source_dir/train.py

import xgboost as xgb

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

import pickle as pkl
import argparse
import os

parser = argparse.ArgumentParser()

# Hyperparameters are described here.
parser.add_argument("--max_depth", type=int, default=5)
parser.add_argument("--eta", type=float, default=0.2)
parser.add_argument("--gamma", type=int, default=4)
parser.add_argument("--min_child_weight", type=int, default=6)
parser.add_argument("--subsample", type=float, default=0.7)
parser.add_argument("--verbosity", type=int, default=2)
parser.add_argument("--objective", type=str, default='reg:squarederror')
parser.add_argument("--num_round", type=int, default=50)
parser.add_argument("--tree_method", type=str, default="auto")
parser.add_argument("--predictor", type=str, default="auto")

# SageMaker specific arguments. Defaults are set in the environment variables.
parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
# parser.add_argument('--validation', type=str, default=os.environ['SM_CHANNEL_VALIDATION'])

args = parser.parse_args()

data = pd.read_csv(f'{args.train}/boston.csv')
X, y = data.iloc[:,:-1], data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

train = xgb.DMatrix(X_train, y_train)
test = xgb.DMatrix(X_test, y_test)

train_hp = {
    "max_depth": args.max_depth,
    "eta": args.eta,
    "gamma": args.gamma,
    "min_child_weight": args.min_child_weight,
    "subsample": args.subsample,
    "verbosity": args.verbosity,
    "objective": args.objective,
    "tree_method": args.tree_method,
    "predictor": args.predictor,
}

model_xgb = xgb.train(params=train_hp, 
                      dtrain=train,
                      evals=[(train, "train"), (test, "validation")], 
                      num_boost_round=100,
                      early_stopping_rounds=20)

preds = model_xgb.predict(test)

rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

model_xgb.save_model(f'{args.model_dir}/model.json')

Overwriting source_dir/train.py


## Local mode training

In [8]:
hyperparameters_local = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "50",
    "verbosity": "2",
}

In [9]:
aaa = sagemaker.get_execution_role()
aaa

'arn:aws:iam::889750940888:role/sinjoonk-sagemaker-demo-role'

In [14]:
from sagemaker.xgboost.estimator import XGBoost

# from sagemaker.local import LocalSession
# sagemaker_session = LocalSession()

use_spot_instances = False
job_name = "DEMO-xgboost-regression-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print("Training job", job_name)

xgb_script_mode_estimator_local = XGBoost(
    entry_point="./source_dir/train.py",
#     source_dir='source_dir',
    hyperparameters=hyperparameters_local,
#     role=role,
    role='dummyrole',
    instance_count=1,
    instance_type='local',
    framework_version='1.3-1',
    output_path=output_path,
    base_job_name='xgboost-batch-transform',
#     sagemaker_session=sagemaker_session
#     checkpoint_s3_uri=checkpoint_s3_uri,
)

Training job DEMO-xgboost-regression-2021-12-14-07-46-06


In [15]:
input_data_local = './data'

In [16]:
xgb_script_mode_estimator_local.fit(
    {'train': 'file://{}'.format(input_data_local)}
)

ClientError: An error occurred (AccessDenied) when calling the GetRole operation: User: arn:aws:sts::889750940888:assumed-role/sinjoonk-sagemaker-demo-role/SageMaker is not authorized to perform: iam:GetRole on resource: role dummyrole

## Managed training w/ Spot

In [18]:
hyperparameters_managed = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "500",                    ### Increasing num_round
    "verbosity": "2",
}

In [19]:
from sagemaker.xgboost.estimator import XGBoost

use_spot_instances = True
max_run = 60*60
max_wait = 60*60

job_name = "DEMO-xgboost-regression-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print("Training job", job_name)
checkpoint_s3_uri = (
    "s3://{}/{}/checkpoints/{}".format(bucket, prefix, job_name) if use_spot_instances else None
)
print("Checkpoint path:", checkpoint_s3_uri)

xgb_script_mode_estimator_managed = XGBoost(
    entry_point="train.py",
    source_dir='source_dir',
    hyperparameters=hyperparameters_managed,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    framework_version='1.3-1',
    output_path=output_path,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    base_job_name='xgboost-batch-transform',
    checkpoint_s3_uri=checkpoint_s3_uri,
)

Training job DEMO-xgboost-regression-2021-12-14-07-26-01
Checkpoint path: s3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/checkpoints/DEMO-xgboost-regression-2021-12-14-07-26-01


In [20]:
xgb_script_mode_estimator_managed.fit(
    {'train': input_data}
)

2021-12-14 07:26:02 Starting - Starting the training job...
2021-12-14 07:26:27 Starting - Launching requested ML instancesProfilerReport-1639466762: InProgress
......
2021-12-14 07:27:28 Starting - Preparing the instances for training............
2021-12-14 07:29:28 Downloading - Downloading input data
2021-12-14 07:29:28 Training - Training image download completed. Training in progress..[34m[2021-12-14 07:29:30.716 ip-10-0-107-91.ap-northeast-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-12-14:07:29:30:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-12-14:07:29:30:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-12-14:07:29:30:INFO] Invoking user training script.[0m
[34m[2021-12-14:07:29:31:INFO] Module train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2021-12-14:07:29:31:INFO] Generating setup.cfg[0m
[34m[2021-12-14:07:29:31:INFO] Generating MANIFEST.in[0m
[34

In [21]:
!aws s3 cp s3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/output/xgboost-batch-transform-2021-12-11-14-35-53-183/model.tar.gz .

download: s3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/output/xgboost-batch-transform-2021-12-11-14-35-53-183/model.tar.gz to ./model.tar.gz


### #TODO Local deploy
https://github.com/aws/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/xgboost_abalone/xgboost_inferenece_script_mode.ipynb

In [22]:
# from sagemaker.xgboost.model import XGBoostModel

# model_data = xgb_script_mode_estimator.model_data
# print(model_data)

# xgb_inference_model = XGBoostModel(
#     model_data=model_data,
#     role=role,
#     entry_point="inference.py",
#     source_dir='source_dir',
#     framework_version="1.3-1",
# )

In [23]:
# predictor = xgb_inference_model.deploy(
#     initial_instance_count=1,
# #     instance_type="ml.c5.xlarge",
#     instance_type='local'
# )

## Processing for Batch transform

In [24]:
model_artifacts = xgb_script_mode_estimator_managed.model_data
model_artifacts

's3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/output/xgboost-batch-transform-2021-12-14-07-26-02-537/output/model.tar.gz'

In [25]:
input_data

's3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/data'

In [26]:
%%writefile source_dir/preprocessing.py

import sys
import subprocess
import logging

# 로그 생성
logger = logging.getLogger()

# 로그의 출력 기준 설정
logger.setLevel(logging.INFO)

# log 출력 형식
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# log 출력
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'xgboost'])

import pandas as pd
import tarfile
import xgboost as xgb

if __name__=='__main__':
    logger.info('Starting preprocessing...')
    
    input_file = '/opt/ml/processing/input/boston.csv'
    input = pd.read_csv(input_file)
    
    logger.info('Input data')
    logger.info(input.head())
    
    logger.info('Loading trained XGBoost model...')
    
    model_artifacts = '/opt/ml/processing/model/model.tar.gz'
    with tarfile.open(model_artifacts) as model:
        model.extractall('/opt/ml/processing/model')
    
    loaded_model = xgb.Booster()
    loaded_model.load_model('/opt/ml/processing/model/model.json')
    
    logger.info('Starting batch prefiction...')
    predictions = loaded_model.inplace_predict(input.loc[:, input.columns != 'PRICE'])
    input['PREDICTED'] = predictions
    
    input.to_csv('/opt/ml/processing/results/results.csv', index=False)
    logger.info('Output data')
    logger.info(input.head())

Overwriting source_dir/preprocessing.py


In [27]:
from sagemaker.sklearn.processing import SKLearnProcessor

sklearn_processor = SKLearnProcessor(framework_version='0.23-1',
                                     role=role,
#                                      instance_type='ml.m5.xlarge',
                                     instance_type='local',
                                     instance_count=1)

In [28]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from time import gmtime, strftime 

processing_job_name = "xgboost-batch-{}".format(strftime("%d-%H-%M-%S", gmtime()))
output_destination = 's3://{}/{}/batch-output'.format(bucket, prefix)

inputs = [ProcessingInput(source=input_data,
                          destination='/opt/ml/processing/input',
                          s3_data_distribution_type='FullyReplicated'),
          ProcessingInput(source=model_artifacts,
                          destination='/opt/ml/processing/model',
                          s3_data_distribution_type='FullyReplicated')]

outputs = [ProcessingOutput(output_name='results',
                            source='/opt/ml/processing/results',
                            destination='{}/results'.format(output_destination))]

sklearn_processor.run(code='./source_dir/preprocessing.py',
                      job_name=processing_job_name,
                      inputs=inputs,
                      outputs=outputs)

preprocessing_job_description = sklearn_processor.jobs[-1].describe()


Job Name:  xgboost-batch-14-07-31-02
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/data', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/output/xgboost-batch-transform-2021-12-14-07-26-02-537/output/model.tar.gz', 'LocalPath': '/opt/ml/processing/model', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-889750940888/xgboost-batch-14-07-31-02/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File

### Download preprocessed data

In [29]:
os.makedirs('./results', exist_ok=True)

In [30]:
!aws s3 sync {output_destination} ./results

In [31]:
!ls -lat ./results/results

total 56
drwxrwxr-x 2 ec2-user ec2-user  4096 Dec 14 07:02 .
drwxrwxr-x 3 ec2-user ec2-user  4096 Dec 14 07:02 ..
-rw-rw-r-- 1 ec2-user ec2-user 48529 Dec 14 06:57 results.csv
