In [6]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

from preprocessing import get_hyperd_data, main

# Report which azureml-core SDK release this kernel is running against.
print(f"SDK version: {azureml.core.VERSION}")

SDK version: 1.42.0


In [5]:
# Resolve the workspace via Workspace.from_config(); no path is passed, so the
# SDK's default configuration lookup applies.
ws = Workspace.from_config()

# Echo the workspace identity, one attribute per line.
print("\n".join(str(field) for field in (ws.name, ws.resource_group, ws.location, ws.subscription_id)))

quick-starts-ws-199195
aml-quickstarts-199195
northcentralus
6971f5ac-8af1-446e-8034-05acea24681f


In [7]:
# Bind to the "udacity-project" experiment inside the workspace loaded above.
exp = Experiment(name="udacity-project", workspace=ws)

# Open a run on that experiment from the notebook itself; `run` is the handle to it.
run = exp.start_logging()

In [8]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: set this to the name of the CPU cluster that should be reused/created.
compute_cluster_name = "ndeg-prj2-clust"

# Look the cluster up by name first; a new one is provisioned only when the
# lookup raises ComputeTargetException.
try:
    compute_cluster = ComputeTarget(workspace=ws, name=compute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',  # for GPU, use "STANDARD_NC6"
        # vm_priority='lowpriority',  # optional
        max_nodes=4,
    )
    compute_cluster = ComputeTarget.create(ws, compute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait on the target (existing or freshly requested) and stream its status.
compute_cluster.wait_for_completion(show_output=True)
# For a more detailed view of current AmlCompute status, use get_status().

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [None]:
# Unpack the two values returned by get_hyperd_data(ws), a helper imported from
# the local preprocessing module.
# NOTE(review): the helper's return types are not visible here — presumably a
# train/test pair of datasets; confirm against preprocessing.py.
train_ds, test_ds = get_hyperd_data(ws)

In [9]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Hyperparameter search space; HyperDrive draws one random combination per child run.
# NOTE(review): the keys are presumably forwarded to the training script as
# command-line flags — confirm the script parses exactly these names.
ps = RandomParameterSampling({
    '--n_estimators': choice(range(2, 100)),      # integers 2..99
    '--max_depth': choice(range(2, 10)),          # integers 2..9
    '--max_features': choice(range(1, 14)),       # integers 1..13
    '--min_samples_leaf': uniform(0.01, 0.5),     # continuous value in [0.01, 0.5]
})

# Early-termination policy.
# * evaluation_interval=2: the policy is applied at every 2nd interval in which
#   the training script reports the primary metric (it counts metric reports,
#   not child runs).
# * slack_factor=0.1: slack is a ratio relative to the best run so far. With a
#   MAXIMIZE goal, a run is cancelled when its best accuracy is below
#   best_so_far / (1 + 0.1), i.e. roughly 9% worse than the current leader.
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Make sure ./training exists. exist_ok=True makes this safe to re-run and
# avoids the check-then-create race of listing the directory first.
os.makedirs("./training", exist_ok=True)

# Setup environment for the training run from the conda specification file
# expected next to this notebook.
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# ScriptRunConfig: which script to run, from which directory, on which compute,
# and in which environment.
src = ScriptRunConfig(
    source_directory=".",
    script="preprocessing.py",
    #arguments=['--input-data', train_ds.as_named_input('train')],
    compute_target=compute_cluster,  # use the previously created compute cluster
    environment=sklearn_env
)

# HyperDriveConfig ties the script config, the sampler and the termination
# policy together. The sweep maximizes the metric logged as 'accuracy', runs at
# most 20 child runs in total, and at most 3 of them concurrently.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=3,
)

In [10]:
# Launch the HyperDrive sweep on the experiment.
hyperdrive_run = exp.submit(hyperdrive_config)

# Render the run-monitoring widget for the sweep.
RunDetails(hyperdrive_run).show()

# Stream the logs until the sweep finishes. This stays the last expression so
# the value it returns is displayed as the cell output.
hyperdrive_run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_65a6b025-f9b1-4eb4-ad20-9dd8f4e35f33
Web View: https://ml.azure.com/runs/HD_65a6b025-f9b1-4eb4-ad20-9dd8f4e35f33?wsid=/subscriptions/6971f5ac-8af1-446e-8034-05acea24681f/resourcegroups/aml-quickstarts-199195/workspaces/quick-starts-ws-199195&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

"<START>[2022-06-23T14:58:19.633748][API][INFO]Experiment created<END>\n""<START>[2022-06-23T14:58:20.206359][GENERATOR][INFO]Trying to sample '3' jobs from the hyperparameter space<END>\n"<START>[2022-06-23T14:58:21.2030477Z][SCHEDULER][INFO]Scheduling job, id='HD_65a6b025-f9b1-4eb4-ad20-9dd8f4e35f33_0'<END><START>[2022-06-23T14:58:21.2577784Z][SCHEDULER][INFO]Scheduling job, id='HD_65a6b025-f9b1-4eb4-ad20-9dd8f4e35f33_1'<END><START>[2022-06-23T14:58:21.4167512Z][SCHEDULER][INFO]Scheduling job, id='HD_65a6b025-f9b1-4eb4-ad20-9dd8f4e35f33_2'<END>"<START>[2022-06-23T14:58:21.344606][GENERATOR][INFO]Successfully sampled '3' jobs, they will soon be submitted to 

{'runId': 'HD_65a6b025-f9b1-4eb4-ad20-9dd8f4e35f33',
 'target': 'ndeg-prj2-clust',
 'status': 'Completed',
 'startTimeUtc': '2022-06-23T14:58:19.280766Z',
 'endTimeUtc': '2022-06-23T15:09:20.989975Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name": "accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'f736bab7-691b-4f22-888e-cd13a3876573',
  'user_agent': 'python/3.8.5 (Linux-5.4.0-1083-azure-x86_64-with-glibc2.10) msrest/0.6.21 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.42.0',
  'space_size': 'infinite_space_size',
  'score': '0.8344032498224114',
  'best_child_run_id': 'HD_65a6b025-f9b1-4eb4-ad20-9dd8f4e35f33_0',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_65a6b025-f9b1-4eb4-ad20-9dd8f4e35f33_0'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg199195.blob.co

In [None]:
# Invoke main() from the local preprocessing module inside the notebook kernel.
# NOTE(review): what main() does is not visible here; preprocessing.py is also
# the script submitted through ScriptRunConfig — confirm that running it locally
# is intended.
main()