# Running RAPIDS hyperparameter experiments at scale on Amazon SageMaker

##### Import packages and create Amazon SageMaker and Boto3 sessions

In [1]:
# Standard library
import os
import time

# Third-party
import boto3
import numpy as np
import sagemaker
from sagemaker.s3 import S3Uploader

# boto3 handles for direct AWS calls.
sess = boto3.Session()
s3 = boto3.resource('s3')

# SageMaker session (used below to upload the dataset) and the role
# that is handed to every Estimator in this notebook.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

##### Download the HIGGS (Higgs boson) dataset from the UCI Machine Learning Repository

In [None]:
!mkdir dataset
!wget -P dataset https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz
!gunzip dataset/HIGGS.csv.gz

##### Download the RAPIDS container from Docker Hub

In [3]:
# Pull the RAPIDS runtime image. The Dockerfile shown later in this notebook
# uses this exact tag as its base image (FROM line).
!docker pull rapidsai/rapidsai:cuda10.0-runtime-ubuntu16.04

cuda10.0-runtime-ubuntu16.04: Pulling from rapidsai/rapidsai
Digest: sha256:d83cb5d56e82acba57aae0eef7d2266431b6700ea66e4067e4222cef8bd42162
Status: Image is up to date for rapidsai/rapidsai:cuda10.0-runtime-ubuntu16.04


##### Define hyperparameters: start with best guess values
The full list of Random Forest hyperparameters is in the RAPIDS cuML documentation:
<br>
https://rapidsai.github.io/projects/cuml/en/0.11.0/api.html#random-forest

In [63]:
# Best-guess starting values for the Random Forest training script.
# Passed unchanged to both Estimator definitions in this notebook.
# NOTE(review): the two bootstrap flags are given as 0 here, while the tuner
# ranges later in this notebook use True/False -- presumably 0 means false;
# confirm that the training script accepts both spellings.
hyperparams = dict(
    n_estimators=15,
    max_depth=5,
    n_bins=8,
    split_criterion=0,       # GINI:0, ENTROPY:1
    split_algo=0,            # HIST:0 GLOBAL_QUANTILE:1
    bootstrap=0,             # true: sample with replacement, false: sample without replacement
    bootstrap_features=0,    # true: sample with replacement, false: sample without replacement
    max_leaves=-1,           # unlimited leaves
    max_features=0.2,
)

##### Before we run a large scale experiment, test training locally using the SageMaker SDK

##### Extend the RAPIDS container: copy in the training script and install `sagemaker-containers`, which makes the image compatible with SageMaker training

In [9]:
# Print the Dockerfile that extends the RAPIDS base image.
!cat docker/Dockerfile

FROM rapidsai/rapidsai:cuda10.0-runtime-ubuntu16.04

RUN apt-get update && apt-get install -y --no-install-recommends build-essential 

RUN source activate rapids && pip install sagemaker-containers

# Copies the training code inside the container
COPY rapids-higgs.py /opt/ml/code/rapids-higgs.py

# Defines rapids-higgs.py as script entry point
ENV SAGEMAKER_PROGRAM rapids-higgs.py


In [14]:
# Build the extended image using ./docker as the build context and tag it
# sagemaker-rapids:latest; this tag is what the local-mode Estimator refers to.
!docker build -t sagemaker-rapids:latest docker

Sending build context to Docker daemon  11.78kB
Step 1/5 : FROM rapidsai/rapidsai:cuda10.0-runtime-ubuntu16.04
 ---> 23341e245c4d
Step 2/5 : RUN apt-get update && apt-get install -y --no-install-recommends build-essential
 ---> Using cache
 ---> 5b28bb2de85f
Step 3/5 : RUN source activate rapids && pip install sagemaker-containers
 ---> Using cache
 ---> 6e69dc0b4116
Step 4/5 : COPY rapids-higgs.py /opt/ml/code/rapids-higgs.py
 ---> Using cache
 ---> 798e59534d52
Step 5/5 : ENV SAGEMAKER_PROGRAM rapids-higgs.py
 ---> Using cache
 ---> 7948e55aa3c4
Successfully built 7948e55aa3c4
Successfully tagged sagemaker-rapids:latest


In [64]:
from sagemaker.estimator import Estimator

# Local-mode smoke test: train with the locally built image against the
# dataset directory on this machine before launching anything remotely.
local_data_dir = 'file://./dataset'
train_instance_type = 'local_gpu'

# NOTE(review): the keyword names (image_name, train_instance_count,
# train_instance_type) appear to follow SageMaker Python SDK v1 -- confirm
# against the installed SDK version.
rapids_estimator = Estimator(
    image_name='sagemaker-rapids:latest',
    role=role,
    train_instance_type=train_instance_type,
    train_instance_count=1,
    hyperparameters=hyperparams,
    # The regex captures the number that follows "test_acc: " in the job log.
    metric_definitions=[
        {
            'Name': 'test_acc',
            'Regex': 'test_acc: ([0-9\\.]+)',
        },
    ],
)

In [65]:
%%time
rapids_estimator.fit({'dataset': local_data_dir})

Creating tmpksw0arl4_algo-1-ikchj_1 ... 
[1BAttaching to tmpksw0arl4_algo-1-ikchj_12mdone[0m
[36malgo-1-ikchj_1  |[0m 2020-02-03 23:12:32,332 sagemaker-containers INFO     Invoking user script
[36malgo-1-ikchj_1  |[0m 
[36malgo-1-ikchj_1  |[0m Training Env:
[36malgo-1-ikchj_1  |[0m 
[36malgo-1-ikchj_1  |[0m {
[36malgo-1-ikchj_1  |[0m     "additional_framework_parameters": {},
[36malgo-1-ikchj_1  |[0m     "channel_input_dirs": {
[36malgo-1-ikchj_1  |[0m         "dataset": "/opt/ml/input/data/dataset"
[36malgo-1-ikchj_1  |[0m     },
[36malgo-1-ikchj_1  |[0m     "current_host": "algo-1-ikchj",
[36malgo-1-ikchj_1  |[0m     "framework_module": null,
[36malgo-1-ikchj_1  |[0m     "hosts": [
[36malgo-1-ikchj_1  |[0m         "algo-1-ikchj"
[36malgo-1-ikchj_1  |[0m     ],
[36malgo-1-ikchj_1  |[0m     "hyperparameters": {
[36malgo-1-ikchj_1  |[0m         "n_estimators": 15,
[36malgo-1-ikchj_1  |[0m         "max_depth": 5,
[36malgo-1-ikchj_1  |[0m         "n_b

In [67]:
# Build the ECR image URI for this account and region.
# Reuses the boto3 session created in the first cell rather than opening
# another default session.
region = sess.region_name
account = sess.client('sts').get_caller_identity().get('Account')
image = '{}.dkr.ecr.{}.amazonaws.com/sagemaker-rapids:latest'.format(account, region)

In [68]:
!aws ecr create-repository --repository-name sagemaker-rapids
!$(aws ecr get-login --no-include-email --region {region})
!docker tag sagemaker-rapids:latest {image}
!docker push {image}

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
The push refers to repository [453691756499.dkr.ecr.us-west-2.amazonaws.com/sagemaker-rapids]

[1Bb32d3917: Preparing 
[1B98c6f529: Preparing 
[1Bb575b882: Preparing 
[1B6a090a5a: Preparing 
[1Be3e639c2: Preparing 
[1B8515050c: Preparing 
[1Bb4819abb: Preparing 
[1B56d33862: Preparing 
[1Bcbfdd831: Preparing 
[1Bc4e796e4: Preparing 
[1B80f86be3: Preparing 
[1B8c8d5f39: Preparing 
[1B7fbd6c92: Preparing 
[1B9d138968: Preparing 
[12Ba090a5a: Pushing  3.162GB/3.092GB[13A[1K[K[14A[1K[K[12A[1K[K[14A[1K[K[13A[1K[K[11A[1K[K[12A[1K[K[13A[1K[K[11A[1K[K[13A[1K[K[14A[1K[K[13A[1K[K[10A[1K[K[13A[1K[K[14A[1K[K[13A[1K[K[10A[1K[K[13A[1K[K[14A[1K[K[13A[1K[K[10A[1K[K[13A[1K[K[10A[1K[K[11A[1K[K[13A[1K[K[12A[1K[K[11A[1K[K[10A[1K[K[11A[1K[K[14A[1K[K[12A[1K[K[14A[1K[K[12A[1K[K[14A[1K[K[11A[1K[K[

In [69]:
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)

# Search space handed to the HyperparameterTuner.
# NOTE(review): 'bootstrap' and 'bootstrap_features' are searched over
# True/False here, whereas the static `hyperparams` dict passes 0 -- confirm
# that the training script parses both forms the same way.
hyperparameter_ranges = {
    # integer-valued ranges
    'n_estimators': IntegerParameter(min_value=10, max_value=200),
    'max_depth': IntegerParameter(min_value=1, max_value=22),
    'n_bins': IntegerParameter(min_value=5, max_value=24),
    # categorical choices
    'split_criterion': CategoricalParameter([0, 1]),
    'split_algo': CategoricalParameter([0, 1]),
    'bootstrap': CategoricalParameter([True, False]),
    'bootstrap_features': CategoricalParameter([True, False]),
    # continuous range
    'max_features': ContinuousParameter(min_value=0.01, max_value=0.5),
}

In [70]:
from sagemaker.estimator import Estimator

# Same estimator definition as the local test, but pointing at the image
# pushed to ECR and at a hosted training instance type.
# This rebinds `rapids_estimator` and `train_instance_type` from the
# local-mode cell.
train_instance_type = 'ml.p3.2xlarge'

rapids_estimator = Estimator(
    image_name=image,
    role=role,
    train_instance_type=train_instance_type,
    train_instance_count=1,
    hyperparameters=hyperparams,
    # The regex captures the number that follows "test_acc: " in the job log.
    metric_definitions=[
        {
            'Name': 'test_acc',
            'Regex': 'test_acc: ([0-9\\.]+)',
        },
    ],
)

In [71]:
# Bayesian search that maximizes the 'test_acc' metric emitted by the job.
# max_jobs=1 / max_parallel_jobs=1 launches a single trial; raise both for
# an actual large-scale search.
tuner = HyperparameterTuner(
    estimator=rapids_estimator,
    hyperparameter_ranges=hyperparameter_ranges,
    strategy='Bayesian',
    objective_metric_name='test_acc',
    objective_type='Maximize',
    metric_definitions=[
        {
            'Name': 'test_acc',
            'Regex': 'test_acc: ([0-9\\.]+)',
        },
    ],
    max_jobs=1,
    max_parallel_jobs=1,
)

##### Upload the dataset to the default SageMaker bucket on Amazon S3

In [None]:
# Upload the local ./dataset directory under the given key prefix; the
# result is passed to tuner.fit as the 'dataset' channel.
s3_data_dir = sagemaker_session.upload_data(
    path='dataset',
    key_prefix='dataset/higgs-dataset',
)

In [75]:
# Job name = fixed prefix + UTC timestamp, so each launch gets its own name.
job_name = (
    'rapidsHPO'
    + time.strftime('%Y-%m-%d-%H-%M-%S-%j', time.gmtime())
)

# Start the tuning job with the uploaded dataset as the 'dataset' channel.
tuner.fit(inputs={'dataset': s3_data_dir}, job_name=job_name)

## Clean up

- Delete S3 buckets and files you don't need
- Kill training jobs that you don't want running
- Delete container images and the repository you just created

In [None]:
aws ecr delete-repository --force --repository-name