In [1]:
from azureml.core import Workspace, Experiment, Environment
import os

# Setting up the workspace
# From a config.json file
ws = Workspace.from_config()

# From a known workspace
# workspace_name = os.environ.get('WORKSPACE_NAME', 'udacity-projects')
# ws = Workspace.get(name=workspace_name)

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

# Setup the experiment
experiment_name = os.environ.get('EXPERIMENT_NAME', 'udacity-project-1')
exp = Experiment(workspace=ws, name=experiment_name)

# Setup the environment
# From a Conda specification file
env = Environment.from_conda_specification(name = "az-ml", file_path = "./envs/env.yml")

# From a pip requirements file
# env = Environment.from_pip_requirements(name = "az-ml", file_path = "path-to-pip-requirements-file")

# Registering and building the environment
# env = env.register(workspace=ws)
# env_build = env.build(workspace=ws)

run = exp.start_logging()

Workspace name: quick-starts-ws-186128
Azure region: southcentralus
Subscription id: 976ee174-3882-4721-b90a-b5fef6b72f24
Resource group: aml-quickstarts-186128


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute

# Setup the compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.
compute_name = os.environ.get('CLUSTER_NAME', 'udacity-cluster')
compute_min_nodes = os.environ.get('CLUSTER_MIN_NODES', 0)
compute_max_nodes = os.environ.get('CLUSTER_MAX_NODES', 4)
vm_size = os.environ.get('CLUSTER_SKU', 'STANDARD_D2_V2')

# Verify if the compute cluster exists
if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('found compute target. just use it. ' + compute_name)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=compute_min_nodes,
        max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

     # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

creating a new compute target...
InProgress.
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Resizing', 'allocationStateTransitionTime': '2022-02-16T22:50:34.828000+00:00', 'errors': None, 'creationTime': '2022-02-16T22:50:34.454999+00:00', 'modifiedTime': '2022-02-16T22:50:38.013165+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [3]:
from azureml.widgets import RunDetails
# from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
from azureml.core import ScriptRunConfig

# Setup hyperparameter tuning

# Specify parameter sampler
ps = RandomParameterSampling(
    {
        'C': choice([x*0.001 for x in range(1,1000)]),
        'max_iter': choice(range(100, 500))
    }
)

# Specify a Policy
policy = BanditPolicy(slack_factor=0.1)

# Get the previously registered environment
# env = Environment.get(workspace=ws, name="az-ml")

# Create an estimator for use with train.py and pass in the environment
est = ScriptRunConfig(
    source_directory="./scripts",
    script="train.py",
    compute_target=compute_target,
    environment=env)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    run_config=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name="accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=100,
    max_concurrent_runs=4)

In [4]:
# Submit hyperdrive run to the experiment and show run details with the widget.
hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

KeyError: 'log_files'

In [5]:
import joblib

# Get best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']
print(best_run_metrics)

AttributeError: 'NoneType' object has no attribute 'get_metrics'

In [None]:
print('Best Run Id: ', best_run.id)

for i in best_run_metrics:
    print(i, best_run_metrics[i])

model = best_run.register_model(model_name='bankmkt-hd', model_path='./outputs/bankmkt-hd.joblib')
model.download(target_dir="models", exist_ok=True)

In [6]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(url)

In [7]:
from scripts.train import clean_data
import pandas as pd

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

if "training" not in os.listdir():
    os.mkdir("training")

# Since we're using AutoML, we do not need to split data now
# Actually we will rather pass in a joined data object for AutoML
# data_train = x.join(y)
data_train = pd.concat([x, y], axis=1)
data_train.to_csv('./training/data_train.csv')

from azureml.core import Dataset

datastore = ws.get_default_datastore()
datastore.upload(src_dir='./training', target_path='./training')
ds = TabularDatasetFactory.from_delimited_files(datastore.path("./training/data_train.csv"))

training shape: (24712, 39), labels: (24712,) 
testing shape: (8238, 39), labels: (8238,)
Uploading an estimated of 1 files
Uploading ./training/data_train.csv
Uploaded ./training/data_train.csv, 1 files out of an estimated total of 1
Uploaded 1 files


"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


In [8]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=ds.to_pandas_dataframe(),
    label_column_name='y',
    n_cross_validations=2)

In [9]:
# Submit automl run
exp = Experiment(workspace=ws, name='bankmkt-automl')
automl_run = exp.submit(automl_config, show_output=False)
automl_run.wait_for_completion(show_output=True)

2022-02-16:22:51:56,60 INFO     [modeling_bert.py:226] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2022-02-16:22:51:56,65 INFO     [modeling_xlnet.py:339] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2022-02-16:22:52:29,273 INFO     [utils.py:159] NumExpr defaulting to 4 threads.


Experiment,Id,Type,Status,Details Page,Docs Page
bankmkt-automl,AutoML_7293599a-b8d8-4e36-8af3-6f1640ef194f,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


In [None]:
# Retrieve and save best automl model.
aml_best_run, model = automl_run.get_output()
print(aml_best_run)
print(model)
joblib.dump(value=aml_best_run.id, filename="./models/bankmkt_automl.joblib")

In [None]:
compute_target.delete()
print('Compute cluster deleted!')