# Random Forest Classification

In [None]:
import os
import numpy as np
import time
import numpy as np
import sagemaker
import time
import boto3
from sagemaker.session import s3_input

# SageMaker SDK session and the execution role passed to the estimator below.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Low-level SageMaker API client.
client = boto3.client("sagemaker")

# Region and account id of the active credentials; both are interpolated
# into the ECR image URI used by the estimator.
region = boto3.Session().region_name
account = (
    boto3.client("sts")
    .get_caller_identity()
    .get("Account")
)

In [None]:
# Fixed hyperparameters for the single (non-tuned) training job.
hyperparams = dict(
    n_estimators=20,
    max_depth=10,
    n_bins=8,
    split_criterion=0,         # GINI:0, ENTROPY:1
    split_algo=0,              # HIST:0 GLOBAL_QUANTILE:1
    bootstrap=True,            # sample with replacement
    bootstrap_features=False,  # sample without replacement
    max_leaves=-1,             # unlimited leaves
    max_features=0.2,
)

In [None]:
from sagemaker.estimator import Estimator

# S3 locations: the estimator's output path and the dataset passed to fit().
output_path = 's3://rapids-demos/'
data_dir = 's3://rapids-demos/dataset'

# Training container image, addressed in the current account's ECR registry.
image = '{}.dkr.ecr.{}.amazonaws.com/sagemaker-rapids:latest'.format(account, region)

rapids_estimator = Estimator(
    image_name=image,
    role=role,
    train_instance_type='ml.p3.2xlarge',
    train_instance_count=1,
    hyperparameters=hyperparams,
    output_path=output_path,
    # Capture the number following "test_acc: " in the job logs as the
    # 'test_acc' metric.
    metric_definitions=[
        {'Name': 'test_acc', 'Regex': r'test_acc: ([0-9\.]+)'},
    ],
)

In [None]:
# Job name = fixed prefix + UTC timestamp (date, time, and day-of-year).
job_name = 'rapids-sagemaker-' + time.strftime(
    '%Y-%m-%d-%H-%M-%S-%j',
    time.gmtime(),
)

# Launch the training job with the S3 dataset bound to the 'dataset' channel.
rapids_estimator.fit(inputs={'dataset': data_dir}, job_name=job_name)

In [None]:
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)

# Search space for the tuning job. Keys mirror the entries of `hyperparams`,
# except `max_leaves`, which is not tuned.
hyperparameter_ranges = dict(
    n_estimators=IntegerParameter(10, 200),
    max_depth=IntegerParameter(10, 100),
    n_bins=IntegerParameter(5, 30),
    split_criterion=CategoricalParameter([0, 1]),
    split_algo=CategoricalParameter([0, 1]),
    bootstrap=CategoricalParameter([True, False]),
    bootstrap_features=CategoricalParameter([True, False]),
    max_features=ContinuousParameter(0.01, 0.5),
)

In [None]:
# Random-search tuner: 32 training jobs in total, at most 8 running at once,
# maximizing the 'test_acc' metric parsed from the job logs.
tuner = HyperparameterTuner(
    estimator=rapids_estimator,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_metric_name='test_acc',
    objective_type='Maximize',
    metric_definitions=[
        {'Name': 'test_acc', 'Regex': r'test_acc: ([0-9\.]+)'},
    ],
    strategy='Random',
    max_jobs=32,
    max_parallel_jobs=8,
)

In [None]:
# Tuning job name = fixed prefix + UTC timestamp.
# NOTE: the 9-character prefix plus the 23-character timestamp is exactly
# 32 characters, the documented maximum length for a SageMaker tuning job
# name -- lengthening the prefix would exceed it.
job_name = 'rapidsHPO' + time.strftime(
    '%Y-%m-%d-%H-%M-%S-%j',
    time.gmtime(),
)

# Launch the tuning job with the S3 dataset bound to the 'dataset' channel.
tuner.fit(inputs={'dataset': data_dir}, job_name=job_name)