# BYO container SageMaker Hyperparameter Search Experiments

In [None]:
import os
import numpy as np
import sagemaker
import boto3
import time
import math
import json
import re
import pandas as pd
from sagemaker.session import s3_input
from sagemaker.estimator import Estimator

# SageMaker session and the IAM execution role handed to the Estimator below.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Low-level SageMaker client, used later to call describe_training_job.
client = boto3.client('sagemaker')

# Region and account id are the variable parts of the ECR image URI.
boto_session = boto3.Session()
region = boto_session.region_name
sts_client = boto3.client('sts')
caller_identity = sts_client.get_caller_identity()
account = caller_identity.get('Account')

In [None]:
# S3 bucket used for both the CIFAR-10 input prefix and the training output path.
bucket_name = 'reinvent-409'
# ECR repository name and image tag; together they select the training container image.
repo_name = 'reinvent-409'
image_tag = 'pytorch'

In [None]:
def train_experiment_run(job_name, hyperparams):
    """Start one training job for a single hyperparameter set.

    An ``Estimator`` is configured for the container image
    ``<account>.dkr.ecr.<region>.amazonaws.com/<repo_name>:<image_tag>`` and
    ``fit`` is called with ``wait=False``, so (per that flag) this function is
    expected to return without waiting for the job to finish.

    Args:
        job_name: Name passed to ``Estimator.fit`` as ``job_name``.
        hyperparams: Hyperparameters passed to the ``Estimator``.

    Returns:
        The estimator's ``_current_job_name`` after ``fit`` has been called.
    """
    bucket_root = 's3://{}/'.format(bucket_name)
    # The 'training' and 'eval' channels both read from the same S3 prefix.
    dataset_uri = 's3://{}/cifar10-dataset/cifar-10-batches-py'.format(bucket_name)
    image_uri = '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format(account, region, repo_name, image_tag)

    # Metric named 'test_acc'; the regex captures the number after "test_acc:".
    accuracy_metric = {'Name': 'test_acc', 'Regex': 'test_acc:([0-9\\.]+)'}

    estimator_args = dict(
        image_name=image_uri,
        role=role,
        train_instance_count=1,
        train_instance_type='ml.p3.2xlarge',
        output_path=bucket_root,
        hyperparameters=hyperparams,
        sagemaker_session=sagemaker_session,
        metric_definitions=[accuracy_metric],
    )
    estimator = Estimator(**estimator_args)

    channels = {'training': dataset_uri, 'eval': dataset_uri}
    estimator.fit(channels, wait=False, job_name=job_name)
    return estimator._current_job_name

In [None]:
# Read the hyperparameter sets to try; each entry becomes one experiment run.
with open("experiment_set-small.json", 'r') as param_file:
    experiment_param_set = json.load(param_file)

num_experiment_runs = len(experiment_param_set)
print('Number of experiment runs: ', num_experiment_runs)

In [None]:
# Launch every experiment run as a training job, at most max_parallel_jobs at
# a time, and wait for each batch to finish before starting the next one.
base_name = 'reinvent-aim409-' + time.strftime('%Y-%m-%d-%H-%M-%S-%j', time.gmtime()) + '-'
max_parallel_jobs = 2
jobs = {}       # requested job name -> copy of the hyperparameters it was launched with
job_names = []  # names returned by train_experiment_run, in launch order

# Ceiling division: a final, partially filled batch must still run. Flooring
# here would silently skip the last (num_experiment_runs % max_parallel_jobs)
# experiments.
exp_batches = (num_experiment_runs + max_parallel_jobs - 1) // max_parallel_jobs
exp_run_number = 0

for i in range(exp_batches):
    running_job_names = []
    # The last batch may hold fewer than max_parallel_jobs runs.
    batch_size = min(max_parallel_jobs, num_experiment_runs - exp_run_number)
    for j in range(batch_size):
        # Short pause between launches (presumably to avoid API throttling).
        time.sleep(2)
        job = base_name + 'expbatch-' + str(i) + 'job-' + str(j)
        hps = experiment_param_set[exp_run_number]
        jobs[job] = hps.copy()
        running_job_names.append(job)
        jname = train_experiment_run(job, hps)
        job_names.append(jname)
        exp_run_number = exp_run_number + 1

    running_jobs = len(running_job_names)
    while running_jobs > 0:
        # Keep only the jobs that are still active. Recomputing the count from
        # this list (instead of decrementing a counter on every poll) ensures a
        # job that already finished is never counted twice, so the loop cannot
        # exit while another job in the batch is still running.
        # 'Stopping' is treated as active because the job has not reached a
        # terminal status yet.
        running_job_names = [
            job for job in running_job_names
            if client.describe_training_job(TrainingJobName=job)['TrainingJobStatus']
            in ('InProgress', 'Stopping')
        ]
        running_jobs = len(running_job_names)
        if running_jobs > 0:
            time.sleep(20)

In [None]:
# Collect hyperparameters and final test accuracy of every launched job into
# one table, one row per training job.
result_columns = ['job_name', 'batch_norm', 'batch_size', 'data_aug_cutout_size', 'epochs', 'max_learning_rate', 'momentum', 'test_acc']

result_rows = []
for finished_job_name in job_names:
    job_summary = client.describe_training_job(TrainingJobName=finished_job_name)

    # A job that failed or never logged the metric has no usable entry in
    # FinalMetricDataList; record NaN for it instead of raising. The metric is
    # looked up by name rather than by position.
    final_metrics = job_summary.get('FinalMetricDataList') or []
    accuracy = next(
        (metric['Value'] for metric in final_metrics if metric.get('MetricName') == 'test_acc'),
        float('nan'),
    )

    row = dict(job_summary.get('HyperParameters') or {})
    row['test_acc'] = accuracy
    row['job_name'] = finished_job_name
    result_rows.append(row)

# Build the frame once from all rows: DataFrame.append was removed in
# pandas 2.0 and copies the whole frame on every call. This also gives the
# rows a unique 0..n-1 index.
experiment_results = pd.DataFrame(result_rows)
# Declared columns first, then any additional hyperparameters the jobs reported.
extra_columns = [column for column in experiment_results.columns if column not in result_columns]
experiment_results = experiment_results.reindex(columns=result_columns + extra_columns)

In [None]:
# Bare expression as the last statement of the cell, so the notebook shows the results table.
experiment_results