In [57]:
# Confirm the Azure ML SDK imports cleanly and report the installed version
import azureml.core

sdk_version = azureml.core.VERSION
print('SDK version:', sdk_version)

SDK version: 1.17.0


In [58]:
from azureml.core.workspace import Workspace

# Load the workspace through Workspace.from_config()
ws = Workspace.from_config()

# Report the workspace identity, one field per line
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# The workspace's default datastore is used later for the data upload
datastore = ws.get_default_datastore()
print("Default datastore's name: {}".format(datastore.name))

Workspace name: HPO-Workspace-Nanthini
Azure region: eastus
Subscription id: 73612009-b37b-413f-a3f7-ec02f12498cf
Resource group: RAPIDS-HPO-Nanthini
Default datastore's name: workspaceblobstore


In [59]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the GPU cluster to reuse if it exists, or to create otherwise
gpu_cluster_name = 'gpu-cluster'

# Read the workspace's compute targets once and reuse the mapping below
existing_targets = ws.compute_targets

if gpu_cluster_name in existing_targets:
    gpu_cluster = existing_targets[gpu_cluster_name]
    if isinstance(gpu_cluster, AmlCompute):
        print('Found compute target. Will use {0} '.format(gpu_cluster_name))
    else:
        # A target with this name exists but is not an AmlCompute cluster.
        # It is still used below (as before), but no longer silently.
        print('WARNING: compute target {0} exists but is a {1}, not an AmlCompute cluster'.format(
            gpu_cluster_name, type(gpu_cluster).__name__))
else:
    print('creating new cluster')
    # The vm_size parameter below could be modified to one of the RAPIDS-supported VM types.
    # Use VM types with more than one GPU for the multi-GPU option, e.g. Standard_NC12s_v3
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_NC6s_v3',
        max_nodes=5,
        idle_seconds_before_scaledown=300,
    )

    # create the cluster
    gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout
    # if no min node count is provided it uses the scale settings for the cluster
    gpu_cluster.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster
print(gpu_cluster.get_status().serialize())

Found compute target. Will use gpu-cluster 
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-11-06T21:08:52.544000+00:00', 'errors': None, 'creationTime': '2020-11-06T18:44:21.840391+00:00', 'modifiedTime': '2020-11-06T18:44:38.935614+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 5, 'nodeIdleTimeBeforeScaleDown': 'PT300S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6S_V3'}


In [60]:
# NOTE(review): this cell repeats the workspace/datastore setup performed by an
# earlier cell in this notebook; it looks redundant - consider removing it.
from azureml.core.workspace import Workspace

ws = Workspace.from_config()

# Print the workspace identity, one field per line
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

datastore = ws.get_default_datastore()
print("Default datastore's name: {}".format(datastore.name))

Workspace name: HPO-Workspace-Nanthini
Azure region: eastus
Subscription id: 73612009-b37b-413f-a3f7-ec02f12498cf
Resource group: RAPIDS-HPO-Nanthini
Default datastore's name: workspaceblobstore


In [61]:
from azureml.core import Experiment

# Experiment in workspace `ws` that the training run is submitted to
experiment_name = 'optuna_rapids'
experiment = Experiment(workspace=ws, name=experiment_name)

In [62]:
from azureml.core import Environment

# create the environment
rapids_env = Environment('rapids_env')

# Docker build steps as a string. Alternatively, load the string from a file.
# This is a regular (non-raw) string, so Python removes each backslash-newline
# pair: the RUN instruction reaches Docker as one long `&&`-chained line.
# Every apt-get install passes -y so the non-interactive image build cannot
# stop at a confirmation prompt.
# NOTE(review): the image pins azureml-sdk==1.13.0 while this notebook reported
# SDK 1.17.0 - confirm the version difference is intended.
dockerfile = """
FROM rapidsai/rapidsai:0.16-cuda10.2-runtime-ubuntu18.04-py3.7
RUN apt-get update && \
apt-get install -y fuse && \
apt-get install -y libssl1.0.0 libssl-dev && \
source activate rapids && \
pip install azureml-sdk==1.13.0 && \
pip install azureml-widgets && \
pip install optuna && \
pip install dask_optuna && \
pip install fusepy
"""

# run inside a Docker container; set base image to None since the image is
# defined by the dockerfile
rapids_env.docker.enabled = True
rapids_env.docker.base_image = None
rapids_env.docker.base_dockerfile = dockerfile

# use rapids environment in the container
rapids_env.python.user_managed_dependencies = True

In [63]:
from azureml.core.dataset import Dataset

# File dataset pointing at the hosted BNP training CSV
bnp_ds = Dataset.File.from_files("https://kagglebnpdataset.blob.core.windows.net/data/bnp_train.csv")

# overwrite=True keeps this cell re-runnable: with the default overwrite=False
# the download raises once data/ already holds the file from a previous run.
bnp_ds.download(target_path='data/', overwrite=True)

# Copy the local data/ folder to the workspace datastore
path_on_datastore = 'bnp_upload'
datastore.upload(src_dir='data/', target_path=path_on_datastore, overwrite=False, show_progress=True)

# File dataset backed by the uploaded datastore path
ds_data = datastore.path(path_on_datastore)
dataset = Dataset.File.from_files(ds_data)

In [64]:
from azureml.core import ScriptRunConfig

# The training script receives the mounted BNP dataset as --data_dir.
# NOTE(review): `dataset` (built from the datastore upload) is not used in the
# cells visible here; confirm that mounting `bnp_ds` is the intended input.
script_params = ['--data_dir', bnp_ds.as_named_input('bnp_input').as_mount()]

project_folder = "./"

# The compute target is given by name; reuse gpu_cluster_name so the cluster
# name is defined in exactly one place.
src = ScriptRunConfig(source_directory=project_folder,
                      script='train_optuna.py',
                      arguments=script_params,
                      compute_target=gpu_cluster_name,
                      environment=rapids_env)

In [65]:
# Submit the configured script run to the experiment
run = experiment.submit(src)


In [66]:
# Wait for the submitted run to finish while streaming its logs into the cell
# output. As the recorded output shows, a failed job surfaces here as an
# ActivityFailedException whose message points to the driver log for details.
run.wait_for_completion(show_output=True)

RunId: optuna_rapids_1604700066_1c2d31bc
Web View: https://ml.azure.com/experiments/optuna_rapids/runs/optuna_rapids_1604700066_1c2d31bc?wsid=/subscriptions/73612009-b37b-413f-a3f7-ec02f12498cf/resourcegroups/RAPIDS-HPO-Nanthini/workspaces/HPO-Workspace-Nanthini

Streaming azureml-logs/55_azureml-execution-tvmps_20d88174fa3e7c6c499639580a6932da81cf902230f1e12c5f607c2a625f3b41_d.txt

2020-11-06T22:05:16Z Starting output-watcher...
2020-11-06T22:05:16Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
2020-11-06T22:05:18Z Executing 'Copy ACR Details file' on 10.0.0.5
2020-11-06T22:05:18Z Copy ACR Details file succeeded on 10.0.0.5. Output: 
>>>   
>>>   
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_d446b97076f6c20a6046fbda4aba0d62
171857c49d0f: Pulling fs layer
419640447d26: Pulling fs layer
61e52f862619: Pulling fs layer
c118dad7e37a: Pulling fs layer
29c091e4be16: Pulling fs layer
d85c81a4428d: Pulling fs layer
e6ba6b94dd40: Pulling fs laye

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "AzureMLCompute job failed.\nJobFailed: Submitted script failed with a non-zero exit code; see the driver log file for details.",
        "details": []
    },
    "correlation": {
        "operation": null,
        "request": "b15c057097c1251d"
    },
    "environment": "eastus",
    "location": "eastus",
    "time": "2020-11-06T22:09:35.094116Z",
    "componentName": "execution-worker"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"AzureMLCompute job failed.\\nJobFailed: Submitted script failed with a non-zero exit code; see the driver log file for details.\",\n        \"details\": []\n    },\n    \"correlation\": {\n        \"operation\": null,\n        \"request\": \"b15c057097c1251d\"\n    },\n    \"environment\": \"eastus\",\n    \"location\": \"eastus\",\n    \"time\": \"2020-11-06T22:09:35.094116Z\",\n    \"componentName\": \"execution-worker\"\n}"
    }
}