In [None]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris

In [None]:
# Create the SageMaker session object that the later cells reuse
# (bucket lookup, region lookup, and the estimator).
session = sagemaker.Session()

# IAM role handed to the estimator below; the training job runs under it.
role = get_execution_role()


In [None]:
# S3 locations of the test, validation and training CSV files.
#
# If you're following along, upload these datasets to your own S3 bucket and
# replace `data_bucket` below with that bucket's name.
data_bucket = 'qs-sagemaker-demo-data'

test_location = 's3://{}/test.csv'.format(data_bucket)
val_location = 's3://{}/validation.csv'.format(data_bucket)
train_location = 's3://{}/train.csv'.format(data_bucket)

In [None]:
# Display the session's default bucket so we can see where output will be written.
session.default_bucket()

In [None]:
# We use this prefix to help us determine where the output will go.
# Despite its name, `prefix` holds the session's default *bucket* name; the
# estimator cell formats it into 's3://{}/output' as the bucket part of the URI.

prefix = session.default_bucket()

In [None]:
# Look up the URI of the XGBoost training container image for the region
# this session is running in.
# NOTE(review): version='latest' does not pin a specific XGBoost release --
# confirm which image it resolves to and consider pinning an explicit version.
container = image_uris.retrieve(
    framework='xgboost',
    region=session.boto_region_name,
    version='latest',
)

In [None]:
# With the container image resolved, build the estimator that describes the
# training job: which image to run, under which role, on what hardware, and
# where the resulting model artifacts are written.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,                  # training container image
    role=role,                            # IAM role (our current role in this case)
    instance_count=1,                     # number of training instances
    instance_type='ml.m4.xlarge',         # type of training instance
    output_path=f's3://{prefix}/output',  # where to save the model artifacts
    sagemaker_session=session,            # the current SageMaker session
)

In [None]:
# Explaining each hyperparameter is beyond the scope of this training session,
# but you can research the algorithm here:
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html

xgb.set_hyperparameters(**{
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'objective': 'reg:linear',
    'early_stopping_rounds': 10,
    'num_round': 200,
})

In [None]:
# Wrap the training and validation S3 locations as CSV input channels.
# Both channels are configured identically, so build them in one pass.
s3_input_train, s3_input_validation = (
    sagemaker.inputs.TrainingInput(s3_data=location, content_type='csv')
    for location in (train_location, val_location)
)

In [None]:
# Launch the training job, passing the two input channels by name.

xgb.fit(inputs=dict(train=s3_input_train, validation=s3_input_validation))