In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-133168
Azure region: southcentralus
Subscription id: a24a24d5-8d87-4c8a-99b6-91ed2d2df51f
Resource group: aml-quickstarts-133168


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

cpu_cluster_name = 'cpucluster'

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
import os

In [4]:
# Specify parameter sampler
ps = RandomParameterSampling(
    {
        '--C': uniform(0.0, 1.0), 
        '--max_iter': choice(50, 100, 150, 200, 250)
    }
)
 
# Specify a Policy
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py

est = SKLearn(source_directory = "./",
            compute_target=cpu_cluster,
            vm_size='STANDARD_D2_V2',
            entry_script="train.py")


# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator=est,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=20,
                                     max_concurrent_runs=4)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [5]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hyperdrive_run = exp.submit(hyperdrive_config)
hyperdrive_run.wait_for_completion(show_output=True)

RunDetails(hyperdrive_run).show()



RunId: HD_87ba3050-d7d0-4798-b35e-e552dd88691d
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_87ba3050-d7d0-4798-b35e-e552dd88691d?wsid=/subscriptions/a24a24d5-8d87-4c8a-99b6-91ed2d2df51f/resourcegroups/aml-quickstarts-133168/workspaces/quick-starts-ws-133168

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-03T14:49:47.145808][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-01-03T14:49:46.213284][API][INFO]Experiment created<END>\n"<START>[2021-01-03T14:49:47.4964683Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2021-01-03T14:49:47.598340][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_87ba3050-d7d0-4798-b35e-e552dd88691d
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_87ba3050-d7d0-4798-b35e-e552dd88691d?wsid=/subscriptions/a24a2

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [6]:
import joblib
# Get your best run and save the model from that run.

best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run.get_file_names()
os.makedirs('outputs',exist_ok=True)
best_run.download_file(best_run.get_file_names()[-1],
output_file_path='./outputs/')

In [7]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files(['https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'])

In [8]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

In [9]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    compute_target=cpu_cluster,
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=ds,
    label_column_name='y',
    enable_onnx_compatible_models=True,
    n_cross_validations=2)

In [10]:
# Submit your automl run

remote_run = exp.submit(automl_config, show_output = False)
remote_run.wait_for_completion()

Running on remote.


{'runId': 'AutoML_7d8e0991-ad3f-4f40-87db-a010ff54dbb8',
 'target': 'cpucluster',
 'status': 'Completed',
 'startTimeUtc': '2021-01-03T15:02:53.436069Z',
 'endTimeUtc': '2021-01-03T15:40:19.630725Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '2',
  'target': 'cpucluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"a80ac018-7f94-4a63-b641-4fb3fa8c573e\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"isArchive\\\\\\": false, \\\\\\"path\\\\\\": {\\\\\\"target\\\\\\": 1, \\\\\\"resourceDetails\\\\\\": [{\\\\\\"path\\\\\\": \\\\\\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv\\\\\\", \\\\\\"sas\\\\\\": null, \\\\\\"storageAccountName\\\\\\": nul

In [11]:
# Retrieve and save your best automl model.

best_run = remote_run.get_output()

print(best_run)

{'f1_score_macro': 0.7662942059852976, 'AUC_macro': 0.9467316252344716, 'precision_score_macro': 0.8010815049508168, 'f1_score_weighted': 0.911480738017321, 'AUC_weighted': 0.9467316252344716, 'matthews_correlation': 0.538686811831369, 'accuracy': 0.9161153262518968, 'norm_macro_recall': 0.48208945926964775, 'average_precision_score_weighted': 0.9550867798996708, 'balanced_accuracy': 0.7410447296348239, 'recall_score_weighted': 0.9161153262518968, 'average_precision_score_micro': 0.9813325147987829, 'f1_score_micro': 0.9161153262518968, 'recall_score_micro': 0.9161153262518968, 'average_precision_score_macro': 0.8235628209533173, 'recall_score_macro': 0.7410447296348239, 'AUC_micro': 0.9805193337953997, 'weighted_accuracy': 0.9596032894579153, 'log_loss': 0.20039017869560205, 'precision_score_weighted': 0.9092671105730274, 'precision_score_micro': 0.9161153262518968, 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.AutoML_7d8e0991-ad3f-4f40-87db-a010ff54dbb8_27/confusion_matrix

In [12]:
best_run.get_metrics()

{'f1_score_macro': 0.7662942059852976,
 'AUC_macro': 0.9467316252344716,
 'precision_score_macro': 0.8010815049508168,
 'f1_score_weighted': 0.911480738017321,
 'AUC_weighted': 0.9467316252344716,
 'matthews_correlation': 0.538686811831369,
 'accuracy': 0.9161153262518968,
 'norm_macro_recall': 0.48208945926964775,
 'average_precision_score_weighted': 0.9550867798996708,
 'balanced_accuracy': 0.7410447296348239,
 'recall_score_weighted': 0.9161153262518968,
 'average_precision_score_micro': 0.9813325147987829,
 'f1_score_micro': 0.9161153262518968,
 'recall_score_micro': 0.9161153262518968,
 'average_precision_score_macro': 0.8235628209533173,
 'recall_score_macro': 0.7410447296348239,
 'AUC_micro': 0.9805193337953997,
 'weighted_accuracy': 0.9596032894579153,
 'log_loss': 0.20039017869560205,
 'precision_score_weighted': 0.9092671105730274,
 'precision_score_micro': 0.9161153262518968,
 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.AutoML_7d8e0991-ad3f-4f40-87db-a010ff54dbb

In [21]:
cpu_cluster.delete()