In [11]:
from azureml.core import Workspace, Experiment, Environment
import os

# --- Workspace ---------------------------------------------------------------
# Load the workspace described by a local config.json file.
# Alternative, when the workspace name is known:
#   workspace_name = os.environ.get('WORKSPACE_NAME', 'udacity-projects')
#   ws = Workspace.get(name=workspace_name)
ws = Workspace.from_config()

# Echo the workspace identity, one field per line.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

# --- Experiment --------------------------------------------------------------
# The experiment name can be overridden through the EXPERIMENT_NAME variable.
experiment_name = os.environ.get('EXPERIMENT_NAME', 'udacity-project-1')
exp = Experiment(name=experiment_name, workspace=ws)

# --- Environment -------------------------------------------------------------
# Built from a Conda specification file.
# Alternative, from a pip requirements file:
#   env = Environment.from_pip_requirements(name = "az-ml", file_path = "path-to-pip-requirements-file")
env = Environment.from_conda_specification(
    name="az-ml",
    file_path="./envs/env.yml",
)

# Optional: register and build the environment in the workspace.
#   env = env.register(workspace=ws)
#   env_build = env.build(workspace=ws)

# Open an interactive logging run on the experiment.
run = exp.start_logging()

Workspace name: quick-starts-ws-164919
Azure region: southcentralus
Subscription id: 3d1a56d2-7c81-4118-9790-f85d1acf0c77
Resource group: aml-quickstarts-164919


In [13]:
from azureml.core.compute import ComputeTarget, AmlCompute

# Setup the compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.
compute_name = os.environ.get('CLUSTER_NAME', 'udacity-cluster')
# Environment variables are always strings, so coerce the node counts to int;
# otherwise an overridden value would reach the provisioning call as a str.
compute_min_nodes = int(os.environ.get('CLUSTER_MIN_NODES', 0))
compute_max_nodes = int(os.environ.get('CLUSTER_MAX_NODES', 4))
vm_size = os.environ.get('CLUSTER_SKU', 'STANDARD_D2_V2')

# Read the workspace's compute targets once and reuse the result for both
# the membership test and the lookup.
existing_targets = ws.compute_targets

# Verify if the compute cluster exists
if compute_name in existing_targets:
    compute_target = existing_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # The name is taken by a different kind of compute; say so instead of
        # silently carrying on with it.
        print('found compute target ' + compute_name
              + ' but it is not an AmlCompute cluster: '
              + type(compute_target).__name__)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=compute_min_nodes,
        max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

creating a new compute target...
InProgress....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Resizing', 'allocationStateTransitionTime': '2021-12-02T16:33:56.274000+00:00', 'errors': None, 'creationTime': '2021-12-02T16:33:55.901327+00:00', 'modifiedTime': '2021-12-02T16:34:11.431493+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [16]:
from azureml.widgets import RunDetails
# from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
from azureml.core import ScriptRunConfig

# --- Hyperparameter tuning setup ---------------------------------------------

# Random sampling over a discrete search space:
#   C        -> x * 0.001 for x in 1..999
#   max_iter -> any integer in range(100, 500)
ps = RandomParameterSampling({
    'C': choice([step * 0.001 for step in range(1, 1000)]),
    'max_iter': choice(range(100, 500)),
})

# Early-termination policy: Bandit with a slack factor of 0.1.
policy = BanditPolicy(slack_factor=0.1)

# To reuse an environment already registered in the workspace instead:
#   env = Environment.get(workspace=ws, name="az-ml")

# Run configuration that executes scripts/train.py on the compute target
# inside the environment `env`.
est = ScriptRunConfig(
    source_directory="./scripts",
    script="train.py",
    environment=env,
    compute_target=compute_target,
)

# Tie the run configuration, sampler and policy together; the sweep
# maximizes the "accuracy" metric over at most 100 runs, 4 at a time.
hyperdrive_config = HyperDriveConfig(
    run_config=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    primary_metric_name="accuracy",
    max_concurrent_runs=4,
    max_total_runs=100,
)

In [17]:
# Submit hyperdrive run to the experiment and show run details with the widget.
hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()

# Block until the sweep has finished. The widget above is non-blocking, so
# without this wait a "Run All" would reach the best-run lookup in the next
# cell while child runs are still in progress. The trailing semicolon keeps
# the returned status dictionary out of the cell output.
hyperdrive_run.wait_for_completion(show_output=False);

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [20]:
import joblib

# Look up the child run with the best primary metric and show what it logged.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

# Arguments recorded in the best run's run definition.
best_run_details = best_run.get_details()
parameter_values = best_run_details['runDefinition']['arguments']

print(best_run_metrics)

{'regularization strength:': 0.105, 'max iterations:': 336, 'accuracy': 0.9149065307113377}


In [19]:
print('Best Run Id: ', best_run.id)

# One "name value" line per metric logged by the best run.
for metric_name, metric_value in best_run_metrics.items():
    print(metric_name, metric_value)

# Register the best run's model file in the workspace, then download a
# local copy into the "models" directory.
model = best_run.register_model(
    model_path='./outputs/bankmkt-hd.joblib',
    model_name='bankmkt-hd',
)
model.download(exist_ok=True, target_dir="models")

Best Run Id:  HD_cedb6d50-bb1c-4d6a-ae53-e33242d13d7e_4
regularization strength: 0.105
max iterations: 336
accuracy 0.9149065307113377


'models/bankmkt-hd.joblib'

In [21]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build a TabularDataset directly from the hosted bank-marketing training CSV.
url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=url)

In [22]:
from scripts.train import clean_data
import pandas as pd

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

# Create the output directory if needed; safe to re-run.
os.makedirs("training", exist_ok=True)

# Since we're using AutoML, we do not need to split data now.
# AutoML takes features and label together, so join them column-wise.
data_train = pd.concat([x, y], axis=1)

# index=False: do not write the DataFrame's row index. Otherwise it lands in
# the CSV as an extra unnamed column and is read back as a feature.
data_train.to_csv('./training/data_train.csv', index=False)

from azureml.core import Dataset

# Push the local ./training folder to the workspace's default datastore and
# rebuild `ds` as a TabularDataset that points at the uploaded CSV.
datastore = ws.get_default_datastore()
# overwrite=True: on a re-run, replace any previously uploaded copy so the
# datastore never serves a stale data_train.csv.
datastore.upload(src_dir='./training', target_path='./training', overwrite=True)
ds = TabularDatasetFactory.from_delimited_files(datastore.path("./training/data_train.csv"))

training shape: (24712, 39), labels: (24712,) 
testing shape: (8238, 39), labels: (8238,)
Uploading an estimated of 1 files
Uploading ./training/data_train.csv
Uploaded ./training/data_train.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [23]:
from azureml.train.automl import AutoMLConfig

# AutoML settings, collected in one place and unpacked into AutoMLConfig.
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_settings = {
    'experiment_timeout_minutes': 30,
    'task': 'classification',
    'primary_metric': 'accuracy',
    # The dataset is materialized as a pandas DataFrame before being handed over.
    'training_data': ds.to_pandas_dataframe(),
    'label_column_name': 'y',
    'n_cross_validations': 2,
}
automl_config = AutoMLConfig(**automl_settings)

In [24]:
# Run AutoML under its own experiment, "bankmkt-automl".
# Note that this rebinds `exp`, which earlier cells bound to the HyperDrive experiment.
exp = Experiment(name='bankmkt-automl', workspace=ws)

# Submit quietly, then stream progress while waiting for the run to finish.
automl_run = exp.submit(config=automl_config, show_output=False)
automl_run.wait_for_completion(show_output=True)



Experiment,Id,Type,Status,Details Page,Docs Page
bankmkt-automl,AutoML_5ea92d5e-6d2b-4833-a80c-0a886553af00,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


INFO:interpret_community.common.explanation_utils:Using default datastore for uploads


Experiment,Id,Type,Status,Details Page,Docs Page
bankmkt-automl,AutoML_5ea92d5e-6d2b-4833-a80c-0a886553af00,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation




****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|3692                             |1                                |32950                                 |
+---------------------------------+---------------------------------+--------------------------------------+

********************************************

{'runId': 'AutoML_5ea92d5e-6d2b-4833-a80c-0a886553af00',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2021-12-02T16:59:59.401013Z',
 'endTimeUtc': '2021-12-02T17:19:19.690811Z',
 'services': {},
   'message': 'No scores improved over last 20 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '2',
  'target': 'local',
  'DataPrepJsonString': None,
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.34.0", "azureml-train": "1.34.0", "azureml-train-restclients-hyperdrive": "1.34.0", "azureml-train-core":

In [25]:
# Retrieve and save best automl model.
# get_output() yields the best child run together with its fitted model.
aml_best_run, model = automl_run.get_output()
print(aml_best_run)
print(model)

# Make sure the target directory exists even if no earlier cell created it.
os.makedirs("./models", exist_ok=True)

# Persist the fitted model itself. The previous code dumped only the run id
# string, so the saved file could not be loaded back as a model.
joblib.dump(value=model, filename="./models/bankmkt_automl_AUC.joblib")

Run(Experiment: bankmkt-automl,
Id: AutoML_5ea92d5e-6d2b-4833-a80c-0a886553af00_35,
Type: None,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/mount...
    gpu_training_param_dict={'processing_unit_type': 'cpu'}
), random_state=0, reg_alpha=0.8333333333333334, reg_lambda=1.9791666666666667, subsample=0.8, tree_method='auto'))], verbose=False))], flatten_transform=None, weights=[0.07142857142857142, 0.2857142857142857, 0.21428571428571427, 0.07142857142857142, 0.07142857142857142, 0.14285714285714285, 0.14285714285714285]))],
         verbose=False)


['./models/bankmkt_automl_AUC.joblib']

In [26]:
# Tear down the compute target that was located or created earlier in this notebook.
compute_target.delete()