With all these powerful tools at our disposal, every data scientist should feel empowered to up-level their model before serving it to the world!

<img src='img/hpo.png'>

In [1]:
import sagemaker
import time
# from helper_functions import *
# working_directory = get_notebook_path()

In [2]:
# IAM role and SDK session for this notebook; both are handed to the Estimator further down.
execution_role = sagemaker.get_execution_role()
session = sagemaker.Session()

# IPython shell capture: `name = !cmd` stores the command's output lines as a list,
# which is why later cells index the result with `[0]`.
account=!(aws sts get-caller-identity --query Account --output text)
region=!(aws configure get region)

In [3]:
# Key prefix (folder) inside the data bucket; interpolated into the S3 input URI used for training.
# Presumably selects a 10-year slice of the airline dataset — TODO confirm.
dataset_directory = '10_year'

In [5]:
# NOTE(review): this replaces the region captured from the AWS CLI in an earlier cell
# with a fixed value. It stays a one-element list so `region[0]` keeps working.
region = ['us-west-2']

# Input side: the dataset bucket name is suffixed with the region.
data_bucket = "sagemaker-rapids-hpo-{}".format(region[0])
s3_data_input = "s3://{}/{}".format(data_bucket, dataset_directory)

# Output side: trained models are written under the session's default bucket.
model_output_bucket = session.default_bucket()
s3_model_output = "s3://{}/trained-models".format(model_output_bucket)

<span style="display: block; text-align: center; color:#8735fb; font-size:30pt"> **1. ML Workflow** </span>

<img src='img/ml_workflow.png' width='800'> 

In this demo we'll utilize the Airline dataset (Carrier On-Time Performance 1987-2020, available from the [Bureau of Transportation Statistics](https://transtats.bts.gov/Tables.asp?DB_ID=120&DB_Name=Airline%20On-Time%20Performance%20Data&DB_Short_Name=On-Time#)). 

The public dataset contains logs/features about flights in the United States (17 airlines) including:

* Locations and distance  ( `Origin`, `Dest`, `Distance` )
* Airline / carrier ( `Reporting_Airline` )
* Scheduled departure and arrival times ( `CRSDepTime` and `CRSArrTime` )
* Actual departure and arrival times ( `DepTime` and `ArrTime` )
* Difference between scheduled & actual times ( `ArrDelay` and `DepDelay` )
* Binary encoded version of late, aka our target variable ( `ArrDel15` )

Using these features we'll build a classifier model to predict whether a flight is going to be more than 15 minutes late on arrival as it prepares to depart.

Having built our container (plus custom logic) and pushed it to ECR, we can finally compile all of our efforts into an Estimator instance.

<img src='img/run_hpo.png'>

In [16]:
# --- Experiment knobs ---------------------------------------------------------
# NOTE(review): algorithm_choice, cv_folds and code_choice are not referenced again
# in this cell; presumably they are consumed by the container setup — confirm.
algorithm_choice = 'XGBoost'
cv_folds = 3
code_choice = 'singleGPU'

search_strategy = 'Bayesian'
max_jobs = 8
# NOTE(review): with max_parallel_jobs equal to max_jobs every training job can start
# at once, so a Bayesian search likely has no earlier results to learn from — confirm
# this trade-off (speed over search quality) is intended.
max_parallel_jobs = 8

max_duration_of_experiment_seconds = 24 * 60 * 60  # one day, in seconds
instance_type = 'ml.p3.2xlarge'

# --- Search space: (min, max) bounds for each tuned hyperparameter -------------
hyperparameter_ranges = dict(
    max_depth=sagemaker.parameter.IntegerParameter(5, 15),
    num_boost_round=sagemaker.parameter.IntegerParameter(100, 500),
    max_features=sagemaker.parameter.ContinuousParameter(0.1, 1.0),
)

# --- Estimator: the container image plus the resources each training job gets --
estimator_params = dict(
    image_uri='453691756499.dkr.ecr.us-west-2.amazonaws.com/rapids-sagemaker:0.14-cuda10.1-runtime-ubuntu18.04-py3.7',
    instance_type=instance_type,
    instance_count=1,
    max_run=max_duration_of_experiment_seconds,
    input_mode='File',
    output_path=s3_model_output,
    sagemaker_session=session,
    role=execution_role,
)
estimator = sagemaker.estimator.Estimator(**estimator_params)

# --- Objective: score scraped from the training logs via regex -----------------
objective_metric_name = 'final-score'
metric_definitions = [
    {'Name': objective_metric_name, 'Regex': 'final-score: (.*);'},
]

# --- Tuner: ties the estimator, search space and objective together ------------
hpo = sagemaker.tuner.HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    objective_type='Maximize',
    metric_definitions=metric_definitions,
    hyperparameter_ranges=hyperparameter_ranges,
    strategy=search_strategy,
    max_jobs=max_jobs,
    max_parallel_jobs=max_parallel_jobs,
)

In [17]:
# Name the tuning job after the current UTC time (second resolution).
job_name = 'rapids-hpo-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Launch the tuning job against the dataset in S3.
hpo.fit(
    inputs=s3_data_input,
    job_name=job_name,
    wait=True,
    logs='All',
)

# Block until the tuning job started by the .fit call above has completed.
# NOTE(review): likely redundant given wait=True above — confirm before removing.
hpo.wait()

........................................................................................................................................................!
!


In [19]:
# Summarize the tuning job launched above: one row per training job with its
# hyperparameters, status, final objective value and timings.
# Uses `job_name` instead of a hard-coded job name so the cell works on every run
# and in any account; substitute a literal name here to inspect an earlier job.
sagemaker.HyperparameterTuningJobAnalytics(job_name).dataframe()

Unnamed: 0,max_depth,max_features,num_boost_round,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,5.0,0.301033,306.0,rapids-hpo-2020-09-29-03-58-53-008-6152948b,Completed,0.935847,2020-09-29 04:00:59+00:00,2020-09-29 04:04:38+00:00,219.0
1,12.0,0.379073,421.0,rapids-hpo-2020-09-29-03-58-53-007-8afc4e19,Completed,0.930685,2020-09-29 04:01:01+00:00,2020-09-29 04:07:28+00:00,387.0
2,8.0,0.81965,254.0,rapids-hpo-2020-09-29-03-58-53-006-8b42b56a,Completed,0.935289,2020-09-29 04:01:09+00:00,2020-09-29 04:04:57+00:00,228.0
3,11.0,0.744959,468.0,rapids-hpo-2020-09-29-03-58-53-005-1bb24bad,Completed,0.931334,2020-09-29 04:01:10+00:00,2020-09-29 04:06:56+00:00,346.0
4,9.0,0.472368,140.0,rapids-hpo-2020-09-29-03-58-53-004-1c806184,Completed,0.934814,2020-09-29 04:01:04+00:00,2020-09-29 04:04:53+00:00,229.0
5,14.0,0.482275,405.0,rapids-hpo-2020-09-29-03-58-53-003-f5df25a3,Completed,0.930248,2020-09-29 04:02:12+00:00,2020-09-29 04:10:21+00:00,489.0
6,5.0,0.39752,448.0,rapids-hpo-2020-09-29-03-58-53-002-46d3f55f,Completed,0.936355,2020-09-29 04:01:10+00:00,2020-09-29 04:04:37+00:00,207.0
7,13.0,0.592225,450.0,rapids-hpo-2020-09-29-03-58-53-001-03e4c956,Completed,0.930359,2020-09-29 04:01:10+00:00,2020-09-29 04:09:16+00:00,486.0
