In [3]:
%matplotlib inline
import numpy as np
import os
import matplotlib.pyplot as plt
import azureml
from azureml.core import Workspace

# Print the installed Azure ML core SDK version so the saved output records
# which SDK release this notebook was executed against.
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.2.0


In [4]:
# Load the workspace handle from the local Azure ML configuration file
# (Workspace.from_config), then echo its identifying fields one per line.
# NOTE(review): the printed subscription id ends up in the saved output;
# consider clearing this cell's output before sharing the notebook.
ws = Workspace.from_config()

workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

Workspace name: cirrustest2
Azure region: southcentralus
Subscription id: c1d0a0ea-bf6e-4c1c-8b55-f1bdb0208df8
Resource group: awe-cirrus-rg


In [5]:
from azureml.core import Experiment

# Local staging directory for the training sources; it is created on first
# run and reused afterwards (exist_ok=True makes the call re-runnable).
script_folder = './cluster-gpu-hyperopt'
os.makedirs(script_folder, exist_ok=True)

# Experiment under which the runs submitted from this notebook are grouped.
experiment_name = 'hyperopt-tf-cluster-gpus'
exp = Experiment(workspace=ws, name=experiment_name)

In [18]:
import shutil

# Stage every source file the training run needs inside `script_folder`
# (the directory created earlier in this notebook). shutil.copy raises
# FileNotFoundError if any of the files is missing from the working directory.
training_files = [
    # Entry script that holds the training logic.
    './keras_mnist-gpu.py',
    # Helpers that load the downloaded dataset into numpy arrays
    # (described as MNIST by the author -- TODO confirm, the other modules
    # suggest a U-Net workflow).
    './utils.py',
    # Supporting modules; presumably imported by the entry script.
    './datagenerator.py',
    './unet.py',
]

for training_file in training_files:
    shutil.copy(training_file, script_folder)

'./cluster-gpu-hyperopt/unet.py'

In [32]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the GPU cluster to reuse or create. Azure ML restricts compute
# names (documented as 2-16 characters: letters, digits and dashes -- verify
# for your SDK version); this 15-character name provisioned successfully.
cluster_name = "sn-gpu-cls8NC12"
gpu_vm_size = 'STANDARD_NC12'
max_cluster_nodes = 8

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Lookup failed (presumably because the cluster does not exist yet),
    # so provision a new one.
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=gpu_vm_size,
        max_nodes=max_cluster_nodes,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # With min_node_count=None the wait follows the cluster's own scale
    # settings; give up after 20 minutes.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print('Found existing compute target')

# Dump the detailed status of the cluster (node counts, scale settings, ...).
print(compute_target.get_status().serialize())

Creating a new compute target...
Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-04-12T11:43:34.403000+00:00', 'errors': None, 'creationTime': '2020-04-12T11:43:29.246965+00:00', 'modifiedTime': '2020-04-12T11:43:46.128367+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 8, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}


In [33]:
from azureml.train.estimator import Estimator

# Default command-line arguments for the entry script. When the estimator is
# run through HyperDrive, both values are overridden by the sampled ones.
script_params = {'--batch-size': 50, '--epochs': 20}

# Reference notebook for the Estimator API:
# https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training-with-deep-learning/how-to-use-estimator/how-to-use-estimator.ipynb
#
# pip_packages / framework_version are intentionally not set: with
# user_managed=True the custom Docker image is expected to already contain
# the Python environment -- TODO confirm the image contents.
estimator_settings = dict(
    source_directory=script_folder,
    script_params=script_params,
    compute_target=compute_target,
    entry_script='keras_mnist-gpu.py',
    custom_docker_image="returncode13/tf-gpu-test:v2",
    use_gpu=True,
    user_managed=True,
)
est = Estimator(**estimator_settings)

In [34]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, loguniform

# Discrete search space: every trial draws one batch size and one epoch
# count at random from the listed choices.
search_space = {
    '--batch-size': choice(25, 50, 100),
    '--epochs': choice(20, 30, 10),
}
ps = RandomParameterSampling(search_space)

In [35]:
# Early-termination policy for the sweep: applied every 2 evaluation
# intervals, with a slack factor of 0.1 relative to the best run so far.
policy = BanditPolicy(
    slack_factor=0.1,
    evaluation_interval=2,
)

In [36]:
# HyperDrive sweep definition: wraps the estimator with the sampling space
# and the early-termination policy.
# NOTE(review): the search space has only 3 x 3 = 9 distinct combinations,
# so 20 total runs will presumably repeat some -- confirm this is intended.
hdc = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    # Metric to optimise; presumably the name the training script logs
    # -- verify against the entry script.
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    primary_metric_name='Loss',
    # Run budget; the concurrency equals the cluster's max_nodes (8).
    max_concurrent_runs=8,
    max_total_runs=20,
)

In [37]:
# Submit the HyperDrive configuration to the experiment; `hdr` is the handle
# to the resulting parent run and is used by the cells that follow.
hdr = exp.submit(config=hdc)

The same input parameter(s) are specified in estimator/run_config script params and HyperDrive parameter space. HyperDrive parameter space definition will override these duplicate entries. ['--batch-size', '--epochs'] is the list of overridden parameter(s).


In [38]:
from azureml.widgets import RunDetails
# Render the run-monitoring widget for the HyperDrive run.
RunDetails(hdr).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [39]:
# Bare expression: display the run's summary (experiment, id, type, status)
# as the cell output.
hdr

Experiment,Id,Type,Status,Details Page,Docs Page
hyperopt-tf-cluster-gpus,HD_c2807a60-4a3c-4434-a378-2b3599b1b259,hyperdrive,Running,Link to Azure Machine Learning studio,Link to Documentation


In [40]:
# Block until the HyperDrive run finishes, streaming its log into the cell
# output; the call returns the run's details dictionary.
hdr.wait_for_completion(show_output=True)

RunId: HD_c2807a60-4a3c-4434-a378-2b3599b1b259
Web View: https://ml.azure.com/experiments/hyperopt-tf-cluster-gpus/runs/HD_c2807a60-4a3c-4434-a378-2b3599b1b259?wsid=/subscriptions/c1d0a0ea-bf6e-4c1c-8b55-f1bdb0208df8/resourcegroups/awe-cirrus-rg/workspaces/cirrustest2

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-04-12T11:43:54.887863][API][INFO]Experiment created<END>\n""<START>[2020-04-12T11:43:55.909106][GENERATOR][INFO]Trying to sample '8' jobs from the hyperparameter space<END>\n"

Execution Summary
RunId: HD_c2807a60-4a3c-4434-a378-2b3599b1b259
Web View: https://ml.azure.com/experiments/hyperopt-tf-cluster-gpus/runs/HD_c2807a60-4a3c-4434-a378-2b3599b1b259?wsid=/subscriptions/c1d0a0ea-bf6e-4c1c-8b55-f1bdb0208df8/resourcegroups/awe-cirrus-rg/workspaces/cirrustest2



{'runId': 'HD_c2807a60-4a3c-4434-a378-2b3599b1b259',
 'target': 'sn-gpu-cls8NC12',
 'status': 'Completed',
 'startTimeUtc': '2020-04-12T11:43:54.253201Z',
 'endTimeUtc': '2020-04-12T12:03:04.866797Z',
 'properties': {'primary_metric_config': '{"name": "Loss", "goal": "minimize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '1da65fad-87f8-4398-829e-b16c2eeadeaf',
  'score': '0.029998686062172056',
  'best_child_run_id': 'HD_c2807a60-4a3c-4434-a378-2b3599b1b259_9',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://cirrustest28363900696.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_c2807a60-4a3c-4434-a378-2b3599b1b259/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=hSzF1YYyQVVdL0%2FkgoPzOznpTj5VycZsSFVZVquV2RU%3D&st=2020-04-12T11%3A53%3A07Z&se=2020-04-12T20%3A03%3A07Z&sp=r'}}

In [41]:
# Pick the child run with the best primary-metric value.
# get_best_run_by_primary_metric() is documented to return None when no child
# run reported the primary metric, so fail with an explicit message instead of
# an opaque AttributeError on the next line.
best_run = hdr.get_best_run_by_primary_metric()
if best_run is None:
    raise RuntimeError(
        'No child run reported the primary metric; cannot select a best run.'
    )

# Show the command-line arguments (i.e. the sampled hyperparameters) that the
# best run was launched with.
print(best_run.get_details()['runDefinition']['arguments'])

['--batch-size', '25', '--epochs', '30']


In [42]:
# List the files stored with the best run (logs and anything the training
# script wrote under outputs/).
print(best_run.get_file_names())

['azureml-logs/55_azureml-execution-tvmps_7e5c7bd4cfcd84087ab1d328ff63b59fcb1b3a275828ec666786c05bd70f9053_d.txt', 'azureml-logs/65_job_prep-tvmps_7e5c7bd4cfcd84087ab1d328ff63b59fcb1b3a275828ec666786c05bd70f9053_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_7e5c7bd4cfcd84087ab1d328ff63b59fcb1b3a275828ec666786c05bd70f9053_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/455_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model/model.h5', 'outputs/model/model.json']


In [43]:
# Register the best run's saved model directory (outputs/model, which holds
# model.h5 and model.json) in the workspace under the name 'gpu-unet'.
model = best_run.register_model(
    model_name='gpu-unet',
    model_path='outputs/model',
)