<img src='img/header.png'>

In [None]:
# select the compute workflow: 'singleCPU', 'singleGPU', 'multiCPU' or 'multiGPU'
ml_workflow_choice = 'multiGPU'

# fail fast when an unsupported workflow name was entered
assert ml_workflow_choice in ('singleCPU', 'singleGPU', 'multiCPU', 'multiGPU')

In [None]:
import os

import sagemaker
from helper_functions import *

In [None]:
# IAM role handed to the estimator, and the SageMaker session used for AWS lookups
execution_role = sagemaker.get_execution_role()
session = sagemaker.Session()

# Resolve the AWS account id and region from the active credentials/session
# instead of pinning literals, so the notebook is portable across accounts and
# regions. Both remain single-element lists because later cells index them as
# account[0] / region[0].
account = [session.boto_session.client('sts').get_caller_identity()['Account']]
region = [session.boto_region_name]

In [None]:
# display the account id and region (each is held as a single-element list)
account, region

| dataset | data_bucket | dataset_directory | # samples | storage type | time span |
|---|---|---|---|---|---|
| Airline Stats Small    | demo    | 1_year   | 6.3M   | Parquet     | 2019         |
| Airline Stats Medium   | demo    | 3_year   | 18M    | Parquet     | 2017-2019    |
| Airline Stats Large    | demo    | 10_year  | 63M    | Parquet     | 2010-2019    |
| NYC Taxi               | demo    | NYC_taxi | 6.3M   | CSV         | 2020 January |
| Bring Your Own Dataset | custom  | custom   | custom | Parquet/CSV | custom       |

In [None]:
# dataset location: the demo bucket name carries the region as a suffix
data_bucket = ''.join(('sagemaker-rapids-hpo-', region[0]))

# directory inside the bucket -- one of '1_year', '3_year', '10_year', 'NYC_taxi'
dataset_directory = '10_year'

# output bucket for trained model(s) -- currently disabled
# model_output_bucket = session.default_bucket()

In [None]:
# Local ("file://") locations for the training data and the trained-model output.
# [cloud alternative] s3_data_input   = f"s3://{data_bucket}/{dataset_directory}"
# [cloud alternative] s3_model_output = f"s3://{model_output_bucket}/trained-models"

# follow the dataset chosen above rather than a hard-coded directory name
s3_data_input = f"file://data/{dataset_directory}"

# directory the notebook was started from; the best HPO model is saved here
best_hpo_model_local_save_directory = os.getcwd()

# Anchor the model output at the working directory instead of a user-specific
# absolute path. In a file URI the text right after "file://" is the authority,
# so the previous "file://home/..." did not denote "/home/..."; an absolute
# path needs the "file:///..." form, which the absolute cwd provides here.
s3_model_output = f"file://{os.path.join(best_hpo_model_local_save_directory, 'trained-models')}"

In [None]:
# select the learning algorithm: 'XGBoost', 'RandomForest' or 'KMeans'
algorithm_choice = 'KMeans'

# fail fast when an unsupported algorithm name was entered
assert algorithm_choice in ('XGBoost', 'RandomForest', 'KMeans')

In [None]:
# number of cross-validation folds (at least one is required)
cv_folds = 3

assert 1 <= cv_folds

In [None]:
# HPO search space: each entry maps a hyperparameter name to the range to explore
hyperparameter_ranges = dict(
    max_depth=sagemaker.parameter.IntegerParameter(5, 15),
    n_estimators=sagemaker.parameter.IntegerParameter(100, 500),
    max_features=sagemaker.parameter.ContinuousParameter(0.1, 1.0),
)  # see note above for adding additional parameters

In [None]:
# XGBoost names the number-of-trees parameter 'num_boost_round' whereas
# RandomForest uses 'n_estimators', so rename the key for XGBoost runs.
# The membership check keeps the cell re-runnable: once the key has been
# renamed, a second execution is a no-op instead of raising KeyError.
if 'XGBoost' in algorithm_choice and 'n_estimators' in hyperparameter_ranges:
    hyperparameter_ranges['num_boost_round'] = hyperparameter_ranges.pop('n_estimators')

In [None]:
# KMeans has its own hyperparameters, so the search space is replaced wholesale
if 'KMeans' in algorithm_choice:
    hyperparameter_ranges = dict(
        n_clusters=sagemaker.parameter.IntegerParameter(7, 10),
        max_iter=sagemaker.parameter.IntegerParameter(290, 310),
    )

In [None]:
# select the HPO search strategy: 'Random' or 'Bayesian'
search_strategy = 'Random'

# fail fast when an unsupported strategy name was entered
assert search_strategy in ('Random', 'Bayesian')

In [None]:
# please choose the total number of HPO experiments
# [ this number is set very low to allow for automated CI testing ]
max_jobs = 2

In [None]:
# please choose the number of experiments that can run in parallel
# (with the total of 2 experiments chosen above, both may run at the same time)
max_parallel_jobs = 2

In [None]:
# upper bound on the runtime of a single experiment: 24 hours, expressed in seconds
max_duration_of_experiment_seconds = 24 * 60 * 60

In [None]:
# we will recommend a compute instance type, feel free to modify
# NOTE(review): the value is currently pinned to "local_gpu" (presumably SageMaker
# local mode on a GPU machine -- confirm); the helper-based recommendation is
# left commented out at the end of the line
instance_type = "local_gpu"  # recommend_instance_type(ml_workflow_choice, dataset_directory) 

In [None]:
# please choose whether spot instances should be used
# (this flag is read again when the estimator parameters are assembled)
use_spot_instances_flag = True

In [None]:
# s3_model_output = f"cloud-ml-examples//aws"

In [None]:
# recap every choice made so far before any container work starts
summarize_choices(
    s3_data_input,
    s3_model_output,
    ml_workflow_choice,
    algorithm_choice,
    cv_folds,
    instance_type,
    use_spot_instances_flag,
    search_strategy,
    max_jobs,
    max_parallel_jobs,
    max_duration_of_experiment_seconds,
)

<span style="display: block; text-align: center; color:#8735fb; font-size:30pt"> **1. ML Workflow** </span>

In [None]:
# switch into the code/ directory; the Dockerfile is written and built relative to it
%cd code

In [None]:
# %load train.py

In [None]:
# %load workflows/MLWorkflowSingleGPU.py

In [None]:
# base image for our container: it becomes the Dockerfile's FROM line and is pulled before the build
rapids_base_container = 'rapidsai/rapidsai-cloud-ml:latest'

In [None]:
# repository name used for our image in ECR
image_base = 'cloud-ml-sagemaker'

# Reuse the base image's tag for our own image. Only the final path component
# is inspected, so a ':' that belongs to a registry "host:port" prefix is not
# mistaken for the tag separator; an untagged reference falls back to
# 'latest', which is the tag docker assumes when none is given.
image_tag = rapids_base_container.rsplit('/', 1)[-1].partition(':')[2] or 'latest'

In [None]:
# Resolve the account id from the active credentials rather than pinning a
# literal; kept as a single-element list because it is indexed as account[0]
# when the ECR image name is assembled.
account = [session.boto_session.client('sts').get_caller_identity()['Account']]

In [None]:
# full image reference in ECR: <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
ecr_fullname = f"{account[0]}.dkr.ecr.{region[0]}.amazonaws.com/{image_base}:{image_tag}"

In [None]:
# display the assembled image reference for a visual check
ecr_fullname

In [None]:
# (Re)create the Dockerfile with the base image and the notebook's choices
# exported as environment variables for the code running in the container.
# The header is a single string, so it is written with write(); writelines()
# expects an iterable of lines and would walk the string one character at a time.
with open('Dockerfile', 'w') as dockerfile:
    dockerfile.write(f'FROM {rapids_base_container} \n\n'
                     f'ENV AWS_DATASET_DIRECTORY="{dataset_directory}"\n'
                     f'ENV AWS_ALGORITHM_CHOICE="{algorithm_choice}"\n'
                     f'ENV AWS_ML_WORKFLOW_CHOICE="{ml_workflow_choice}"\n'
                     f'ENV AWS_CV_FOLDS="{cv_folds}"\n')

In [None]:
%%writefile -a Dockerfile

# ensure printed output/log-messages retain correct order
ENV PYTHONUNBUFFERED=True

# path where SageMaker looks for code when container runs in the cloud
ENV CLOUD_PATH="/opt/ml/code"

# copy our latest [local] code into the container 
COPY . $CLOUD_PATH

# make the entrypoint script executable
RUN chmod +x $CLOUD_PATH/entrypoint.sh

# run from the code directory and start the container through entrypoint.sh
WORKDIR $CLOUD_PATH
ENTRYPOINT ["./entrypoint.sh"]

In [None]:
# check the generated Dockerfile against the chosen base image
# (presumably a helper from helper_functions -- TODO confirm what exactly it validates),
# then print the file for inspection
validate_dockerfile(rapids_base_container)
!cat Dockerfile

In [None]:
# fetch the base image locally before building on top of it
!docker pull $rapids_base_container

In [None]:
%%time
# build the image from the current directory and tag it with the full ECR image reference
!docker build . -t $ecr_fullname -f Dockerfile

In [None]:
# Prepare the `docker login` command for our ECR registry; element [0] is
# executed by the next cell. `aws ecr get-login` is not available in AWS CLI
# v2, so the password is obtained with `get-login-password` and piped straight
# into `docker login --password-stdin`. Only the command text is stored here,
# which keeps the registry password out of notebook variables.
docker_login_str = [
    f"aws ecr get-login-password --region {region[0]} | "
    f"docker login --username AWS --password-stdin "
    f"{account[0]}.dkr.ecr.{region[0]}.amazonaws.com"
]

In [None]:
# execute the login command prepared in the previous cell
!{docker_login_str[0]}

Create the ECR repository [ if it doesn't already exist ]

In [None]:
repository_query = !(aws ecr describe-repositories --repository-names $image_base)
if repository_query[0] == '':
    !(aws ecr create-repository --repository-name $image_base)

Let's now actually push the container to ECR
> Note the first push to ECR may take some time (hopefully less than 10 minutes).

In [None]:
# upload the tagged image to the ECR repository
!docker push $ecr_fullname

<span style="color:#8735fb; font-size:20pt"> 2.2 - Create Estimator </span>

Having built our container [ + custom logic ] and pushed it to ECR, we can finally compile all of our efforts into an Estimator instance.

In [None]:
from sagemaker.local import LocalSession

# NOTE(review): LocalSession is only referenced by the commented-out alternative
# below, and sagemaker_session itself is not passed to the estimator (that entry
# is commented out in estimator_params) -- confirm which session is intended.
# sagemaker_session = LocalSession()
sagemaker_session = sagemaker.Session()
# sagemaker_session.config = {'local': {'local_code': True}}

In [None]:
# Keyword arguments for sagemaker.estimator.Estimator.
# 'volume_size' - EBS volume size in GB, default = 30
estimator_params = {
    'image_uri': ecr_fullname,
    'role': execution_role,

    'instance_type': instance_type,
    'instance_count': 1,

    'input_mode': 'File',
    'output_path': s3_model_output,

    'max_run': max_duration_of_experiment_seconds, # 24 hours
}

# Managed spot training only applies to cloud instances, and 'max_wait' is only
# meaningful together with 'use_spot_instances', so either both are set or
# neither is. 'max_wait' must not be smaller than 'max_run', hence the + 1.
if use_spot_instances_flag and not instance_type.startswith('local'):
    estimator_params.update({
        'use_spot_instances': True,
        'max_wait': max_duration_of_experiment_seconds + 1,
    })

In [None]:
# instantiate the estimator from the keyword arguments assembled in estimator_params
estimator = sagemaker.estimator.Estimator(**estimator_params)

<span style="color:#8735fb; font-size:20pt"> 2.3 - Test Estimator </span>

Now we are ready to test by asking SageMaker to run the BYOContainer logic inside our Estimator. This is a useful step if you've made changes to your custom logic and are interested in making sure everything works before launching a large HPO search. 

> Note: This verification step will use the default hyperparameter values declared in our custom train code, as SageMaker HPO will not be orchestrating a search for this single run.

In [None]:
# inputs for the single verification run: a downsampled local copy of the data
s3_data_input = "file://data/10_year_downsampled"

# NOTE(review): estimator_params captured the earlier s3_model_output value, so
# this reassignment is reflected in the summary below but not in the estimator
s3_model_output = "file://trained-models"

summarize_choices(
    s3_data_input,
    s3_model_output,
    ml_workflow_choice,
    algorithm_choice,
    cv_folds,
    instance_type,
    use_spot_instances_flag,
    search_strategy,
    max_jobs,
    max_parallel_jobs,
    max_duration_of_experiment_seconds,
)

In [None]:
# derive the training job name from the current configuration
job_name = new_job_name_from_config(
    dataset_directory,
    region,
    ml_workflow_choice,
    algorithm_choice,
    cv_folds,
    instance_type,
)

In [None]:
# return to the parent directory (undoes the earlier `%cd code`)
%cd ..

The working directory printed below should be the repository's `aws` directory (e.g. /home/nfs/syurick/cloud-ml-examples/aws).

In [None]:
# print the current working directory
!pwd

In [None]:
%%time
# launch the training run on the chosen input; the job name is lower-cased before it is passed on
estimator.fit(inputs = s3_data_input, job_name = job_name.lower())