In [1]:
from os import path, makedirs
experiment_name = 'tensorboard-demo'

# Local folder that will hold the training script; it is later passed to
# ScriptRunConfig as the source directory of the run.
exp_dir = './sample_projects/' + experiment_name

# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of "if not exists: makedirs".
makedirs(exp_dir, exist_ok=True)

In [2]:
import requests
import os

# Download the training script into the experiment folder.
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
tf_code = requests.get("https://raw.githubusercontent.com/tensorflow/tensorflow/r1.8/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py",
                       timeout=60)
# Fail loudly on 4xx/5xx instead of saving an HTTP error page as the script.
tf_code.raise_for_status()

# Explicit encoding so the write does not depend on the platform's default.
with open(os.path.join(exp_dir, "mnist_with_summaries.py"), "w", encoding="utf-8") as file:
    file.write(tf_code.text)

In [3]:
import azureml.core
from azureml.core import Workspace
from azureml.core import Experiment

# Load the workspace handle from the saved workspace configuration.
ws = Workspace.from_config()

# create directories for experiment logs and dataset
logs_dir = os.path.join(os.curdir, "logs")
data_dir = os.path.abspath(os.path.join(os.curdir, "mnist_data"))

# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of "if not exists: makedirs".
makedirs(data_dir, exist_ok=True)

# NOTE(review): presumably the downloaded script derives its default data
# location from TEST_TMPDIR -- confirm against mnist_with_summaries.py.
os.environ["TEST_TMPDIR"] = data_dir

# Writing logs to ./logs results in their being uploaded to the run history,
# and thus, made accessible to our TensorBoard instance.
args = ["--log_dir", logs_dir]

# Create an experiment
exp = Experiment(ws, experiment_name)

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute

cluster_name = "cpu-cluster"

# Reuse the cluster when the workspace already has an AmlCompute target with
# this name; otherwise provision a new one.
cts = ws.compute_targets
found = cluster_name in cts and cts[cluster_name].type == 'AmlCompute'

if found:
    print('Found existing compute target.')
    compute_target = cts[cluster_name]
else:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )

    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# Wait on the compute target before submitting work, echoing progress.
compute_target.wait_for_completion(show_output=True, min_node_count=None)

# use get_status() to get a detailed status for the current cluster. 
# print(compute_target.get_status().serialize())

Found existing compute target.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [8]:
from azureml.core import Environment, ScriptRunConfig

# Here we will use the TensorFlow 2.2 curated environment
# NOTE(review): this is the GPU flavour of the environment, while the compute
# target above is named "cpu-cluster" and uses STANDARD_D2_V2 VMs -- confirm
# this pairing is intended.
tf_env = Environment.get(workspace=ws, name='AzureML-TensorFlow-2.2-GPU')

# Describe the run: which script to execute, from which folder, with which
# arguments, on which compute target and in which environment.
src = ScriptRunConfig(
    source_directory=exp_dir,
    script='mnist_with_summaries.py',
    arguments=args,
    compute_target=compute_target,
    environment=tf_env,
)

# Submit the configured run under the experiment.
run = exp.submit(config=src)

In [9]:
# Wait for the submitted run to finish without streaming its logs into the
# notebook (show_output=False). A failed job surfaces here as an
# ActivityFailedException, as in the recorded output of this cell.
run.wait_for_completion(show_output=False)

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "AzureMLCompute job failed.\nInitializationError: Failed to create Batch AI directory.\n\tReason: Failed to create Job Directory /mnt/batch/tasks/shared/LS_root/jobs/amlbriksews/azureml/tensorboard-demo_1610980377_d6f7af6f/mounts/workspaceblobstore/azureml/tensorboard-demo_1610980377_d6f7af6f/azureml_compute_logs: mkdir /mnt/batch/tasks/shared/LS_root/jobs/amlbriksews/azureml/tensorboard-demo_1610980377_d6f7af6f/mounts/workspaceblobstore: file exists\n\tInfo: Failed to prepare an environment for the job execution: Job environment preparation failed on 10.0.0.4 with err exit status 1.",
        "messageFormat": "{Message}",
        "messageParameters": {
            "Message": "AzureMLCompute job failed.\nInitializationError: Failed to create Batch AI directory.\n\tReason: Failed to create Job Directory /mnt/batch/tasks/shared/LS_root/jobs/amlbriksews/azureml/tensorboard-demo_1610980377_d6f7af6f/mounts/workspaceblobstore/azureml/tensorboard-demo_1610980377_d6f7af6f/azureml_compute_logs: mkdir /mnt/batch/tasks/shared/LS_root/jobs/amlbriksews/azureml/tensorboard-demo_1610980377_d6f7af6f/mounts/workspaceblobstore: file exists\n\tInfo: Failed to prepare an environment for the job execution: Job environment preparation failed on 10.0.0.4 with err exit status 1."
        },
        "details": [],
        "innerError": {
            "code": "BadArgument",
            "innerError": {
                "code": "AmlComputeBadRequest"
            }
        }
    },
    "correlation": {
        "operation": null,
        "request": "3a482786efdddfd4"
    },
    "environment": "westeurope",
    "location": "westeurope",
    "time": "2021-01-18T14:33:21.848878Z",
    "componentName": "execution-worker"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"AzureMLCompute job failed.\\nInitializationError: Failed to create Batch AI directory.\\n\\tReason: Failed to create Job Directory /mnt/batch/tasks/shared/LS_root/jobs/amlbriksews/azureml/tensorboard-demo_1610980377_d6f7af6f/mounts/workspaceblobstore/azureml/tensorboard-demo_1610980377_d6f7af6f/azureml_compute_logs: mkdir /mnt/batch/tasks/shared/LS_root/jobs/amlbriksews/azureml/tensorboard-demo_1610980377_d6f7af6f/mounts/workspaceblobstore: file exists\\n\\tInfo: Failed to prepare an environment for the job execution: Job environment preparation failed on 10.0.0.4 with err exit status 1.\",\n        \"messageFormat\": \"{Message}\",\n        \"messageParameters\": {\n            \"Message\": \"AzureMLCompute job failed.\\nInitializationError: Failed to create Batch AI directory.\\n\\tReason: Failed to create Job Directory /mnt/batch/tasks/shared/LS_root/jobs/amlbriksews/azureml/tensorboard-demo_1610980377_d6f7af6f/mounts/workspaceblobstore/azureml/tensorboard-demo_1610980377_d6f7af6f/azureml_compute_logs: mkdir /mnt/batch/tasks/shared/LS_root/jobs/amlbriksews/azureml/tensorboard-demo_1610980377_d6f7af6f/mounts/workspaceblobstore: file exists\\n\\tInfo: Failed to prepare an environment for the job execution: Job environment preparation failed on 10.0.0.4 with err exit status 1.\"\n        },\n        \"details\": [],\n        \"innerError\": {\n            \"code\": \"BadArgument\",\n            \"innerError\": {\n                \"code\": \"AmlComputeBadRequest\"\n            }\n        }\n    },\n    \"correlation\": {\n        \"operation\": null,\n        \"request\": \"3a482786efdddfd4\"\n    },\n    \"environment\": \"westeurope\",\n    \"location\": \"westeurope\",\n    \"time\": \"2021-01-18T14:33:21.848878Z\",\n    \"componentName\": \"execution-worker\"\n}"
    }
}

In [None]:
from azureml.tensorboard import Tensorboard

# Attach a TensorBoard instance to the run submitted above.
tb = Tensorboard([run])

# If successful, start() returns a string with the URI of the instance.
# The call is not the last expression of the cell, so its return value would
# not be displayed; capture and print it so the URI is visible.
tensorboard_uri = tb.start()
print(tensorboard_uri)

# After your job completes, be sure to stop() the streaming otherwise it will continue to run. 
# NOTE(review): stop() is called right after start() in this same cell; if the
# dashboard should stay reachable, consider running stop() from a separate
# cell once you are done -- confirm intended usage.
tb.stop()