In [7]:
import azureml.core
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import TensorFlow
from azureml.train.hyperdrive import *
from azureml.widgets import RunDetails

# Report the installed Azure ML SDK version so the notebook's results can be
# tied to a specific SDK release.
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.0.2


Connect to the workspace described in the `.\aml_config\config.json` file

In [6]:
# Connect to a workspace.
# Workspace.from_config() is called without arguments, so the workspace details
# come from a config file it locates itself (the aml_config/config.json file
# mentioned above) rather than from values written in this notebook.
ws = Workspace.from_config()
print("Workspace name: ", ws.name)

Found the config file in: C:\Users\jumin\git\Recommenders\notebooks\02_model\aml_config\config.json
Workspace name:  junmin-aml-workspace


In [9]:
# Display the mount-mode data reference for the workspace's default datastore.
# The same expression is passed to the training script as '--data-folder' in
# the estimator cells further down.
ws.get_default_datastore().as_mount()

$AZUREML_DATAREFERENCE_workspaceblobstore

Create a remote compute target

In [10]:
CLUSTER_NAME = 'gpu-cluster'

# Reuse the cluster if the workspace already has one with this name;
# a ComputeTargetException from the lookup means it has to be provisioned.
try:
    compute_target = ComputeTarget(workspace=ws, name=CLUSTER_NAME)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',  # STANDARD_NC24s_v3
        min_nodes=1,
        max_nodes=4
    )
    # create the cluster
    compute_target = ComputeTarget.create(ws, CLUSTER_NAME, compute_config)

    # Block until provisioning finishes (or 20 minutes elapse). With
    # min_node_count=None the cluster's own scale settings decide how many
    # nodes to wait for.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20
    )
else:
    print('Found existing compute target')

# Use the 'status' property to get a detailed status for the current cluster.
print(compute_target.status.serialize())

# One summary line per compute target registered in the workspace.
compute_targets = ws.compute_targets
for target_name, target in compute_targets.items():
    print(target_name, target.type, target.provisioning_state)

Creating a new compute target...
Creating
Succeeded.....................
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned
{'allocationState': 'Steady', 'allocationStateTransitionTime': '2018-12-20T14:47:28.945000+00:00', 'creationTime': '2018-12-20T14:45:00.343249+00:00', 'currentNodeCount': 1, 'errors': None, 'modifiedTime': '2018-12-20T14:45:28.941889+00:00', 'nodeStateCounts': {'idleNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0, 'preparingNodeCount': 1, 'runningNodeCount': 0, 'unusableNodeCount': 0}, 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 1, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'targetNodeCount': 1, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}
gpu-cluster AmlCompute Succeeded


Prepare dataset

In [None]:
# Folder name of the dataset; the commented-out upload below uses it both as
# the local source directory ('./' + DATA_PATH) and as the target path on the
# datastore.
DATA_PATH = 'movielens_100k'

# TODO: download the dataset locally, then upload it to the workspace's default
# datastore. Intended upload code (not yet enabled):
# ds = ws.get_default_datastore()
# ds.upload(src_dir='./'+DATA_PATH, target_path=DATA_PATH, overwrite=True, show_progress=True)

Prepare training script

In [None]:
# Entry script run by the TensorFlow estimator (passed as entry_script below).
DEEP_MODEL_TRAIN_SCRIPT = 'deep_model_train.py'
# Local folder handed to the estimator as source_directory.
SCRIPT_FOLDER = './model_train'


Set (or search for) hyperparameters

In [None]:
# Toggle between a HyperDrive hyperparameter search (True) and a single
# training run with fixed hyperparameters (False).
HYPERPARAMETER_TUNING = True

# Experiment under which the run is submitted. It is created here so the cell
# works on a fresh kernel; the name matches the 'deep-model' experiment used
# later in this notebook.
exp = Experiment(workspace=ws, name='deep-model')

# Mount-mode reference to the workspace's default datastore, passed to the
# training script as --data-folder in both modes.
data_folder = ws.get_default_datastore().as_mount()

if HYPERPARAMETER_TUNING:
    # The tuned arguments come from the sampling space below, so the estimator
    # itself only fixes the data location. (vs. a plain Estimator)
    est = TensorFlow(source_directory=SCRIPT_FOLDER,
                     script_params={'--data-folder': data_folder},
                     compute_target=compute_target,
                     entry_script=DEEP_MODEL_TRAIN_SCRIPT,
                     use_gpu=True)

    # Random search over the space below (vs. GridParameterSampling).
    ps = RandomParameterSampling(
        {
            # '--batch-size': choice(25, 50, 100),
            '--first-layer-neurons': choice(10, 50, 200, 300, 500),
            '--second-layer-neurons': choice(10, 50, 200, 500),
            '--learning-rate': loguniform(-6, -1)
        }
    )

    # Early termination policy
    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

    hdrc = HyperDriveRunConfig(estimator=est,
                               hyperparameter_sampling=ps,
                               policy=policy,
                               primary_metric_name='validation_acc',
                               primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                               max_total_runs=8,
                               max_concurrent_runs=4)
    # Submit the HyperDrive configuration built just above.
    run = exp.submit(config=hdrc)
else:
    # Single run: every hyperparameter is passed explicitly to the script.
    est = TensorFlow(source_directory=SCRIPT_FOLDER,
                     script_params={
                         '--data-folder': data_folder,
                         '--batch-size': 50,
                         '--first-layer-neurons': 300,
                         '--second-layer-neurons': 100,
                         '--learning-rate': 0.01
                     },
                     compute_target=compute_target,
                     entry_script=DEEP_MODEL_TRAIN_SCRIPT,
                     use_gpu=True)
    run = exp.submit(est)

# Show the run widget, then block until the submitted run finishes.
RunDetails(run).show()
run.wait_for_completion(show_output=True)


In [None]:
# Pick the run whose model gets registered.
# get_best_run_by_primary_metric() is only available on a HyperDrive parent
# run; when tuning is switched off, `run` is the single training run itself.
if HYPERPARAMETER_TUNING:
    best_run = run.get_best_run_by_primary_metric()
else:
    best_run = run

# List the files the chosen run produced, then register its saved model.
print(best_run.get_file_names())
model = best_run.register_model(model_name='tf-dnn', model_path='outputs/model')



In [None]:
# TODO Checks








# Create AmlCompute cluster


# Create an experiment to track the runs in the workspace
deep_model_exp = Experiment(workspace=ws, name='deep-model')

# Start an interactive run on the experiment created above
deep_model_run = deep_model_exp.start_logging()

# Log a number and a list, then mark the run as completed
deep_model_run.log("my_number", 42)
deep_model_run.log_list("my_list", [1, 2, 3])
deep_model_run.complete()

# Link to this run in the Azure portal
print(deep_model_run.get_portal_url())



In [None]:
from azureml.train.hyperdrive import *
import math

# Sample the learning rate log-uniformly between 1e-6 and 1e-4.
# loguniform takes (min_value, max_value) in log space, so the smaller bound
# must come first. The key matches the '--learning-rate' argument used for the
# training script elsewhere in this notebook.
param_sampling = RandomParameterSampling({
    '--learning-rate': loguniform(math.log(1e-6), math.log(1e-4)),
})

# `est` is the TensorFlow estimator built in the hyperparameter cell above.
# NOTE(review): the earlier run config uses 'validation_acc' as its primary
# metric; confirm that the training script actually logs a metric named 'f1'.
hyperdrive_run_config = HyperDriveRunConfig(
    estimator=est,
    hyperparameter_sampling=param_sampling,
    primary_metric_name='f1',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=16,
    max_concurrent_runs=4)

In [None]:
# Clean-up resources
# WARNING: this deletes the whole workspace `ws`, not only the compute cluster
# created in this notebook. delete_dependent_resources=True additionally asks
# for the workspace's associated resources to be removed.
# NOTE(review): confirm exactly which resources that covers (it may include
# the storage behind the default datastore) before running this cell, and
# skip it when using "Run All" on a workspace you want to keep.
ws.delete(delete_dependent_resources=True)


### References

* [Fine-tune natural language processing models using Azure Machine Learning service](https://azure.microsoft.com/en-us/blog/fine-tune-natural-language-processing-models-using-azure-machine-learning-service/)
* [Training, hyperparameter tune, and deploy with TensorFlow](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb)
