### Install sagemaker-experiments

In [1]:
import sagemaker
import boto3

session = sagemaker.Session()
sm = boto3.Session().client('sagemaker')
role = sagemaker.get_execution_role()

### Split Data

In [2]:
import pandas as pd

RANDOM_STATE = 99

DATA_FILE = './aws_sagemaker/scikit-learn/classification/iris_parameterized/data/iris.csv'

# load csv in memory
data = pd.read_csv(DATA_FILE)
data.head()

Unnamed: 0.1,Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,0,5.1,3.5,1.4,0.2,0.0
1,1,4.9,3.0,1.4,0.2,0.0
2,2,4.7,3.2,1.3,0.2,0.0
3,3,4.6,3.1,1.5,0.2,0.0
4,4,5.0,3.6,1.4,0.2,0.0


In [3]:
# split data into test and training
train_data = data.sample(frac=0.8, random_state=RANDOM_STATE)
test_data = data.drop(train_data.index)

test_data = test_data.drop(['class'],axis=1)
test_data.head()

Unnamed: 0.1,Unnamed: 0,sepal length,sepal width,petal length,petal width
5,5,5.4,3.9,1.7,0.4
12,12,4.8,3.0,1.4,0.1
17,17,5.1,3.5,1.4,0.3
19,19,5.1,3.8,1.5,0.3
20,20,5.4,3.4,1.7,0.2


#### Define S3 Paths

In [8]:
BUCKET_NAME = 'snowflake-getting-started'
BASE_PREFIX = 'iris'

INPUT_DATA_PREFIX = BASE_PREFIX+'/data/input'
TRAIN_DATA_PREFIX = BASE_PREFIX+'/data/input/train'
TEST_DATA_PREFIX = BASE_PREFIX+'/data/input/test'

EXPERIMENTS_OUTPUT_LOC = 's3://'+BUCKET_NAME+'/'+BASE_PREFIX+'/hypertuning'
EXPERIMENTS_TRAINING_METRICS_PREFIX = BASE_PREFIX + '/hypertuning'
EXP_SOURCE_CODE=EXPERIMENTS_OUTPUT_LOC+'/hypertuning/code'

print ('Experiment metadata would be published at -',EXPERIMENTS_OUTPUT_LOC)

Experiment metadata would be published at - s3://snowflake-getting-started/iris/hypertuning


#### Upload data to S3

In [9]:
print ('Uploading train data to s3')

s3_input_data_path = session.upload_data(path=DATA_FILE, 
                           bucket=BUCKET_NAME, 
                           key_prefix=INPUT_DATA_PREFIX)

print ('Input data uploaded to -', s3_input_data_path)

Uploading train data to s3
Input data uploaded to - s3://snowflake-getting-started/iris/data/input/iris.csv


In [43]:
from sagemaker.sklearn import SKLearn
from smexperiments.trial import Trial
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig, CollectionConfig
from sagemaker.tuner import HyperparameterTuner, IntegerParameter
import csv

sklearnestimator = SKLearn(entry_point='./aws_sagemaker/scikit-learn/classification/iris_parameterized/train.py',
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session = session,
                              role = role,
                              code_location  = EXP_SOURCE_CODE,
                              input_mode='File',
                              metric_definitions=[
                                {'Name':'test:f1-score', 'Regex':'Test F1-Score: (.*)'},
                                {'Name':'test:accuracy', 'Regex':'Test Accuracy: (.*)'}
                              ],
                              enable_sagemaker_metrics=True,
                              output_path = 's3://snowflake-getting-started/iris/hypertuning'
                        )

# Configure HyperparameterTuner
irisTuner = HyperparameterTuner(estimator=sklearnestimator,  # previously-configured Estimator object
                               objective_metric_name='test:accuracy',
                               hyperparameter_ranges={'max_iter': IntegerParameter(5,7)},
                               metric_definitions=[{'Name': 'test:accuracy', 'Regex':'Test Accuracy: (.*)'}],
                               max_jobs=2,
                               max_parallel_jobs=1)

# Start hyperparameter tuning job
irisTuner.fit(inputs={'training': s3_input_data_path},
              logs=True,
              wait=True)


INFO:root:_TuningJob.start_new!!!
INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-scikit-lea-200727-2131


In [56]:
print (irisTuner.describe()['TrainingJobStatusCounters'])

{'Completed': 1, 'InProgress': 1, 'RetryableError': 0, 'NonRetryableError': 0, 'Stopped': 0}


In [61]:
irisTuner.analytics().dataframe()

Unnamed: 0,max_iter,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,6.0,sagemaker-scikit-lea-200727-2131-002-775d484e,Completed,1.0,2020-07-27 21:36:53+00:00,2020-07-27 21:37:50+00:00,57.0
1,5.0,sagemaker-scikit-lea-200727-2131-001-3e5520dd,Completed,0.947368,2020-07-27 21:33:43+00:00,2020-07-27 21:34:46+00:00,63.0


### Compare the model training runs for an experiment

Now we will use the analytics capabilities of Python SDK to query and compare the training runs for identifying the best model produced by our experiment. You can retrieve trial components by using a search expression.

### Some Simple Analyses

In [None]:
# Retrieve analytics object
irisTuner_analytics = irisTuner.analytics()

# Look at summary of associated training jobs
tuner_dataframe = irisTuner_analytics.dataframe()

print(tuner_dataframe)