# Distributed PyTorch with Horovod

In [40]:
# Check core SDK version number
import azureml.core

print("SDK version:", azureml.core.VERSION)

SDK version: 1.33.0


In [2]:
from azureml.telemetry import set_diagnostics_collection

set_diagnostics_collection(send_diagnostics=True)

Turning diagnostics collection on. 


In [3]:
import uuid

# Create a model folder in the current directory
os.makedirs('./model', exist_ok=True)
timeline_dir = "./model/horovod-timeline/%s" % uuid.uuid4()
os.makedirs(timeline_dir, exist_ok=True)
os.environ['HOROVOD_TIMELINE'] = timeline_dir + "/horovod_timeline.json"

In [4]:
from azureml.core.workspace import Workspace

ws = Workspace.from_config()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep='\n')

Workspace name: ramwar-ml
Azure region: westeurope
Subscription id: f51eb4a8-7d45-4f2f-a966-ca0283c0701a
Resource group: ramwar-ml


## Attach existing Cluster

In [24]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "ramwar-gpu-node"
compute_target = ComputeTarget(workspace=ws, name=cluster_name)
print('Found existing cluster.')
print(compute_target.get_status().serialize())

Found existing cluster.
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-10-06T06:23:08.071000+00:00', 'errors': None, 'creationTime': '2021-10-06T06:22:55.237765+00:00', 'modifiedTime': '2021-10-06T06:23:11.013971+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 3, 'nodeIdleTimeBeforeScaleDown': 'PT2400S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


## Train model on the remote compute

In [32]:
import os

project_folder = './pytorch-distr-hvd'
os.makedirs(project_folder, exist_ok=True)

### Create an experiment

In [33]:
from azureml.core import Experiment

experiment_name = 'pytorch-distr-hvd'
experiment = Experiment(ws, name=experiment_name)

### Create an environment

In [34]:
from azureml.core import Environment

pytorch_env = Environment.get(ws, name='AzureML-PyTorch-1.6-GPU')

### Configure the training job

In [41]:
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import MpiConfiguration

src = ScriptRunConfig(source_directory=project_folder,
                      script='pytorch_horovod_mnist.py',
                      #script='pytorch_synthetic_benchmark.py',
                      compute_target=compute_target,
                      environment=pytorch_env,
                      distributed_job_config=MpiConfiguration(node_count=3))

### Submit Job

As the run is executed, it goes through the following stages:

* __Preparing__: A docker image is created according to the environment defined. The image is uploaded to the workspace's container registry and cached for later runs. Logs are also streamed to the run history and can be viewed to monitor progress. If a curated environment is specified instead, the cached image backing that curated environment will be used.

* __Scaling__: The cluster attempts to scale up if the Batch AI cluster requires more nodes to execute the run than are currently available.

* __Running__: All scripts in the script folder are uploaded to the compute target, data stores are mounted or copied, and the script is executed. Outputs from stdout and the ./logs folder are streamed to the run history and can be used to monitor the run.

* __Post-Processing__: The ./outputs folder of the run is copied over to the run history.


In [42]:
run = experiment.submit(src)
print(run)

Run(Experiment: pytorch-distr-hvd,
Id: pytorch-distr-hvd_1633504887_7d69d02d,
Type: azureml.scriptrun,
Status: Starting)


### Register model

In [37]:
model = run.register_model(model_name='pytorch-mnist', model_path='outputs/model.pt')

ModelPathNotFoundException: ModelPathNotFoundException:
	Message: Could not locate the provided model_path outputs/model.pt in the set of files uploaded to the run: []
                See https://aka.ms/run-logging for more details.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Could not locate the provided model_path outputs/model.pt in the set of files uploaded to the run: []\n                See https://aka.ms/run-logging for more details."
    }
}

In [43]:
from azureml.widgets import RunDetails

RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…

### Waiting for training completion

In [44]:
run.wait_for_completion(show_output=True) # this provides a verbose log

RunId: pytorch-distr-hvd_1633504887_7d69d02d
Web View: https://ml.azure.com/runs/pytorch-distr-hvd_1633504887_7d69d02d?wsid=/subscriptions/f51eb4a8-7d45-4f2f-a966-ca0283c0701a/resourcegroups/ramwar-ml/workspaces/ramwar-ml&tid=de6197ce-5e7c-42b4-bb3b-6b60ed5dee9f

Streaming azureml-logs/55_azureml-execution-tvmps_b2ed8fec555266c0d2b1816656946cd435982d4a99266fc43feb028aabc25d8a_d.txt

2021-10-06T07:21:46Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/ramwar-ml/azureml/pytorch-distr-hvd_1633504887_7d69d02d/mounts/workspaceblobstore
2021-10-06T07:21:47Z Failed to start nvidia-fabricmanager due to exit status 5 with output Failed to start nvidia-fabricmanager.service: Unit nvidia-fabricmanager.service not found.
. Please ignore this if the GPUs don't utilize NVIDIA® NVLink® switches.
2021-10-06T07:21:47Z Starting output-watcher...
2021-10-06T07:21:47Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: lat

{'runId': 'pytorch-distr-hvd_1633504887_7d69d02d',
 'target': 'ramwar-gpu-node',
 'status': 'Completed',
 'startTimeUtc': '2021-10-06T07:21:43.806196Z',
 'endTimeUtc': '2021-10-06T07:23:28.135297Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '57449850-eaf0-4980-9f69-2842d557a130',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'pytorch_horovod_mnist.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'Mpi',
  'target': 'ramwar-gpu-node',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'datacaches': [],
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 3,
  'instanceTypes': [],
  'priority': None,
  'credentialPassthrough': False,
  'identity': None,
  'environment': {'name': 'Azure

### Download model

In [20]:
# Download the model from run history
run.download_file(name='outputs/model.pt', output_file_path='./model/model.pt')
#run.download_file(name='outputs/model.onnx', output_file_path='./model/model.onnx')
