In [3]:
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import os                                         # For manipulating filepath names
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer
from botocore.exceptions import ClientError

import boto3
import re

from sklearn import datasets
from sklearn.model_selection import train_test_split

In [4]:
def resolve_sm_role():
    client = boto3.Session(profile_name='default', region_name='us-east-1').client('iam')
    response_roles = client.list_roles(
        PathPrefix='/',
        # Marker='string',
        MaxItems=999
    )
    for role in response_roles['Roles']:
        if role['RoleName'].startswith('AmazonSageMaker-ExecutionRole-'):
            return role['Arn']
    raise Exception('Could not resolve what should be the SageMaker role to be used')
    
def upload_file(bucket, file_name, prefix):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # Upload the file
    s3_client = boto3.Session(profile_name='default', region_name='us-east-1').client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, os.path.join(prefix, file_name))
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [5]:
role = resolve_sm_role()

prefix = 'xgboost/iris'
bucket = 'siakondev-sagemaker-models-us-east-1'

In [6]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [8]:
pd.DataFrame(X_train).to_csv("x_train.csv", index=False, header=False)
pd.DataFrame(y_train).to_csv("y_train.csv", index=False, header=False)

pd.DataFrame(X_test).to_csv("x_test.csv", index=False, header=False)
pd.DataFrame(y_test).to_csv("y_test.csv", index=False, header=False)

In [9]:
upload_file(bucket, 'x_train.csv', f'{prefix}/train')
upload_file(bucket, 'y_train.csv', f'{prefix}/train')
upload_file(bucket, 'x_test.csv', f'{prefix}/test')
upload_file(bucket, 'y_test.csv', f'{prefix}/test')

True

In [10]:
s3_input_train = TrainingInput(s3_data=f"s3://{bucket}/{prefix}/train", content_type='csv')
s3_input_validation = TrainingInput(s3_data=f"s3://{bucket}/{prefix}/test", content_type='csv')

In [11]:
s3_input_train, s3_input_train

(<sagemaker.inputs.TrainingInput at 0x12c34c198>,
 <sagemaker.inputs.TrainingInput at 0x12c34c198>)

In [12]:
container = image_uris.retrieve('xgboost', 'us-east-1', '1.2-1' )

In [13]:
s3_output_location = f's3://{bucket}/{prefix}/model_output'

In [14]:
role, container, s3_output_location

('arn:aws:iam::988095220859:role/service-role/AmazonSageMaker-ExecutionRole-20180610T105283',
 '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1',
 's3://siakondev-sagemaker-models-us-east-1/xgboost/iris/model_output')

In [15]:
xgb = sagemaker.estimator.Estimator(
    container, 
    role=role,
    instance_count=1, 
    instance_type='ml.m5.large',
    input_mode= 'File',
    output_path=s3_output_location,
    session=boto3.Session(profile_name='default', region_name='us-east-1')
)

In [16]:
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective='binary:logistic',
    num_round=100
)

In [17]:
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation}, logs=True)

2021-05-22 13:18:07 Starting - Starting the training job...
2021-05-22 13:18:50 Starting - Launching requested ML instancesProfilerReport-1621689479: InProgress
......
2021-05-22 13:19:50 Starting - Preparing the instances for training...
2021-05-22 13:20:40 Downloading - Downloading input data..
2021-05-22 13:21:10 Training - Downloading the training image...
2021-05-22 13:21:53 Uploading - Uploading generated training model
2021-05-22 13:21:53 Failed - Training job failed
[34m[2021-05-22 13:21:45.578 ip-10-2-124-78.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm m

UnexpectedStatusException: Error for Training job sagemaker-xgboost-2021-05-22-13-17-59-856: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_xgboost_container/data_utils.py", line 287, in _get_csv_dmatrix_file_mode
    dmatrix = xgb.DMatrix('{}?format=csv&label_column=0&delimiter={}'.format(files_path, delimiter))
  File "/miniconda3/lib/python3.7/site-packages/xgboost/core.py", line 438, in __init__
    feature_types=feature_types)
  File "/miniconda3/lib/python3.7/site-packages/xgboost/data.py", line 497, in dispatch_data_backend
    return _from_uri(data, missing, feature_names, feature_types)
  File "/miniconda3/lib/python3.7/site-packages/xgboost/data.py", line 458, in _from_uri
    ctypes.byref(handle)))
  File "/miniconda3/lib/python3.7/site-packages/xgboost/core.py", line 188, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [13:21:45] ../src/data/data.cc:672: Encountered parser error:
[13:21:45] ../dmlc-core/src/data/csv_parser.h:130: Delimiter '0' is not found in th