# XGBoost 로컬 학습 후 Batch Transform w/ SM Processing

In [20]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.1-py3-none-manylinux2014_x86_64.whl (173.5 MB)
[K     |████████████████████████████████| 173.5 MB 59.9 MB/s eta 0:00:01   |█████                           | 27.3 MB 1.6 MB/s eta 0:01:29
Installing collected packages: xgboost
Successfully installed xgboost-1.5.1
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [186]:
import os
import sagemaker
import time
from time import gmtime, strftime 

## Prepare & Uplad dataset

In [107]:
os.makedirs('./data', exist_ok=True)

In [161]:
from sklearn.datasets import load_boston

boston = load_boston()

data = pd.DataFrame(boston.data)
data.columns = boston.feature_names
data['PRICE'] = boston.target

data.to_csv('./data/boston.csv', sep=',', index=False)

In [132]:
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'batch-transform/xgboost-byos'
output_path = f"s3://{bucket}/{prefix}/output"

role = sagemaker.get_execution_role()

In [134]:
input_data = session.upload_data('./data', key_prefix=f'{prefix}/data')
input_data

's3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/data'

In [135]:
!aws s3 ls {input_data} --recursive

2021-12-11 14:30:25      41085 batch-transform/xgboost-byos/data/.ipynb_checkpoints/boston-checkpoint.csv
2021-12-11 14:30:25      39170 batch-transform/xgboost-byos/data/boston.csv


In [185]:
%%writefile source_dir/train.py

import xgboost as xgb

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

import pickle as pkl
import argparse
import os

parser = argparse.ArgumentParser()

# Hyperparameters are described here.
parser.add_argument("--max_depth", type=int, default=5)
parser.add_argument("--eta", type=float, default=0.2)
parser.add_argument("--gamma", type=int, default=4)
parser.add_argument("--min_child_weight", type=int, default=6)
parser.add_argument("--subsample", type=float, default=0.7)
parser.add_argument("--verbosity", type=int, default=2)
parser.add_argument("--objective", type=str, default='reg:squarederror')
parser.add_argument("--num_round", type=int, default=50)
parser.add_argument("--tree_method", type=str, default="auto")
parser.add_argument("--predictor", type=str, default="auto")

# SageMaker specific arguments. Defaults are set in the environment variables.
parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
# parser.add_argument('--validation', type=str, default=os.environ['SM_CHANNEL_VALIDATION'])

args = parser.parse_args()

data = pd.read_csv(f'{args.train}/boston.csv')
X, y = data.iloc[:,:-1], data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

train = xgb.DMatrix(X_train, y_train)
test = xgb.DMatrix(X_test, y_test)

train_hp = {
    "max_depth": args.max_depth,
    "eta": args.eta,
    "gamma": args.gamma,
    "min_child_weight": args.min_child_weight,
    "subsample": args.subsample,
    "verbosity": args.verbosity,
    "objective": args.objective,
    "tree_method": args.tree_method,
    "predictor": args.predictor,
}

model_xgb = xgb.train(params=train_hp, 
                      dtrain=train,
                      evals=[(train, "train"), (test, "validation")], 
                      num_boost_round=100,
                      early_stopping_rounds=20)

preds = model_xgb.predict(test)

rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

model_xgb.save_model(f'{args.model_dir}/model.json')

Overwriting source_dir/train.py


## Local mode training

In [144]:
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "50",
    "verbosity": "2",
}

In [145]:
from sagemaker.xgboost.estimator import XGBoost

use_spot_instances = False
job_name = "DEMO-xgboost-regression-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print("Training job", job_name)
checkpoint_s3_uri = (
    "s3://{}/{}/checkpoints/{}".format(bucket, prefix, job_name) if use_spot_instances else None
)
print("Checkpoint path:", checkpoint_s3_uri)

xgb_script_mode_estimator = XGBoost(
    entry_point="train.py",
    source_dir='source_dir',
    hyperparameters=hyperparameters,
    role=role,
    instance_count=1,
    instance_type='local',
    framework_version="1.3-1",
    output_path=output_path,
#     use_spot_instances=use_spot_instances,
#     max_run=max_run,
#     max_wait=max_wait,
    base_job_name='xgboost-batch-transform',
    checkpoint_s3_uri=checkpoint_s3_uri,
)

Training job DEMO-xgboost-regression-2021-12-11-14-35-52
Checkpoint path: None


In [146]:
xgb_script_mode_estimator.fit(
    {'train': input_data}
)

Creating r63pd0f5ud-algo-1-wqc6v ... 
Creating r63pd0f5ud-algo-1-wqc6v ... done
Attaching to r63pd0f5ud-algo-1-wqc6v
[36mr63pd0f5ud-algo-1-wqc6v |[0m [2021-12-11 14:35:56.319 22b88be7b30a:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[36mr63pd0f5ud-algo-1-wqc6v |[0m [2021-12-11:14:35:56:INFO] Imported framework sagemaker_xgboost_container.training
[36mr63pd0f5ud-algo-1-wqc6v |[0m [2021-12-11:14:35:56:INFO] No GPUs detected (normal if no gpus installed)
[36mr63pd0f5ud-algo-1-wqc6v |[0m [2021-12-11:14:35:56:INFO] Invoking user training script.
[36mr63pd0f5ud-algo-1-wqc6v |[0m [2021-12-11:14:35:56:INFO] Module train does not provide a setup.py. 
[36mr63pd0f5ud-algo-1-wqc6v |[0m Generating setup.py
[36mr63pd0f5ud-algo-1-wqc6v |[0m [2021-12-11:14:35:56:INFO] Generating setup.cfg
[36mr63pd0f5ud-algo-1-wqc6v |[0m [2021-12-11:14:35:56:INFO] Generating MANIFEST.in
[36mr63pd0f5ud-algo-1-wqc6v |[0m [2021-12-11:14:35:56:INFO] Installing module with the following command

In [147]:
!aws s3 cp s3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/output/xgboost-batch-transform-2021-12-11-14-35-53-183/model.tar.gz .

download: s3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/output/xgboost-batch-transform-2021-12-11-14-35-53-183/model.tar.gz to ./model.tar.gz


### #TODO Local deploy
https://github.com/aws/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/xgboost_abalone/xgboost_inferenece_script_mode.ipynb

In [None]:
# from sagemaker.xgboost.model import XGBoostModel

# model_data = xgb_script_mode_estimator.model_data
# print(model_data)

# xgb_inference_model = XGBoostModel(
#     model_data=model_data,
#     role=role,
#     entry_point="inference.py",
#     source_dir='source_dir',
#     framework_version="1.3-1",
# )

In [None]:
# predictor = xgb_inference_model.deploy(
#     initial_instance_count=1,
# #     instance_type="ml.c5.xlarge",
#     instance_type='local'
# )

## Processing for Batch transform

In [160]:
model_artifacts = xgb_script_mode_estimator.model_data
model_artifacts

's3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/output/xgboost-batch-transform-2021-12-11-14-35-53-183/model.tar.gz'

In [162]:
input_data

's3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/data'

In [212]:
%%writefile source_dir/preprocessing.py

import sys
import subprocess

subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'xgboost'])

import pandas as pd
import tarfile
import xgboost as xgb

if __name__=='__main__':
    input_file = '/opt/ml/processing/input/boston.csv'
    input = pd.read_csv(input_file)
    
    model_artifacts = '/opt/ml/processing/model/model.tar.gz'
    with tarfile.open(model_artifacts) as model:
        model.extractall('/opt/ml/processing/model')
    
    loaded_model = xgb.Booster()
    loaded_model.load_model('/opt/ml/processing/model/model.json')
    
    predictions = loaded_model.inplace_predict(input.loc[:, input.columns != 'PRICE'])
    input['PREDICTED'] = predictions
    
    input.to_csv('/opt/ml/processing/results/results.csv', index=False)

Overwriting source_dir/preprocessing.py


In [213]:
from sagemaker.sklearn.processing import SKLearnProcessor

sklearn_processor = SKLearnProcessor(framework_version='0.23-1',
                                     role=role,
                                     instance_type='ml.m5.xlarge',
#                                      instance_type='local',
                                     instance_count=1)

In [214]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from time import gmtime, strftime 

processing_job_name = "xgboost-batch-{}".format(strftime("%d-%H-%M-%S", gmtime()))
output_destination = 's3://{}/{}/batch-output'.format(bucket, prefix)

inputs = [ProcessingInput(source=input_data,
                          destination='/opt/ml/processing/input',
                          s3_data_distribution_type='FullyReplicated'),
          ProcessingInput(source=model_artifacts,
                          destination='/opt/ml/processing/model',
                          s3_data_distribution_type='FullyReplicated')]

outputs = [ProcessingOutput(output_name='results',
                            source='/opt/ml/processing/results',
                            destination='{}/results'.format(output_destination))]

sklearn_processor.run(code='./source_dir/preprocessing.py',
                      job_name=processing_job_name,
                      inputs=inputs,
                      outputs=outputs)

preprocessing_job_description = sklearn_processor.jobs[-1].describe()


Job Name:  xgboost-batch-11-16-06-21
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-889750940888/batch-transform/xgboost-byos/data', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-889750940888/xgboost-batch-11-16-06-21/input/input-2/model.tar.gz', 'LocalPath': '/opt/ml/processing/model', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-889750940888/xgboost-batch-11-16-06-21/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3