In [1]:
import os
# [print(f'{k: <25}\t{v}') for k,v in os.environ.items()];

Determine if we're running in a compute environment or a local environment

In [2]:
NOTEBOOK_LOCATION = 'cloud' if os.environ.get('USER','') == 'azureuser' else 'local'
print(NOTEBOOK_LOCATION)

cloud


In [3]:
from azureml.core.authentication import InteractiveLoginAuthentication
if NOTEBOOK_LOCATION == 'local':
    TENANT_ID = input('Tenant ID: ')
    SUBSCRIPTION_ID = input('Subscription ID: ')
    RESOURCE_GROUP = input('Resource group: ')
    WORKSPACE_NAME = input('Workspace name: ')
    auth = InteractiveLoginAuthentication(force=True, tenant_id=TENANT_ID)

# Get or create a Workspace reference

In [4]:
from azureml.core import Workspace
from azureml.core import Experiment

if NOTEBOOK_LOCATION == 'local':
    ws = Workspace.get(
        name=WORKSPACE_NAME,
        subscription_id=SUBSCRIPTION_ID,
        resource_group=RESOURCE_GROUP
    )
else:
    ws = Workspace.from_config()
    
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code EKGVQNQBX to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: azureml
Azure region: canadacentral
Subscription id: c6a455b8-27f8-45a1-bb69-60c79f39ac1b
Resource group: opg217757-azureml


# Create compute target

In [5]:
# list existing compute targets
ws.compute_targets.keys()

dict_keys(['notebook'])

In [6]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "aml-compute")
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")
compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 2)
compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4)

if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('Using existing compute target: ' + compute_name)
else:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=compute_min_nodes,
        max_nodes=compute_max_nodes
    )
    compute_target = ComputeTarget.create(ws, compute_name, compute_config)

# Submit HyperDrive experiment

In [7]:
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.parameter_expressions import loguniform, choice

hyp_est = SKLearn(
    source_directory='./',
    entry_script='train.py',
    compute_target=compute_target
)
'''
hyperparameters for sklearn.linear_model.LogisticRegression:
penalty='l2'   # ['l1', 'l2', 'elasticnet', 'none']
C=1.0   # [.001, .01, .1, 1, 10, 100, 1000]
solver='lbfgs'   # ['liblinear', 'saga']
max_iter=100   # [125, 150]
multi_class='auto'   # ['multinomial', 'ovr']
'''

# Specify parameter sampler
ps = RandomParameterSampling({
    "penalty": choice('l1', 'l2'),
    "C": loguniform(-4, 3),
    "solver": choice('lbfgs', 'saga')
})

# Specify a Policy for early stopping
policy = BanditPolicy(
    evaluation_interval = 10,
    slack_factor = 0.2
)

In [8]:
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.runconfig import HyperDriveConfig

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    estimator=hyp_est,
    hyperparameter_sampling=ps,
    primary_metric_name='norm_macro_recall',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    policy=policy,
    max_total_runs=50,
    max_concurrent_runs=4
)

In [9]:
hyp_exp = Experiment(workspace=ws, name="hyperdrive_bank")

In [10]:
compute_target.wait_for_completion(show_output=True)

Creating
Succeeded..........................................
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [11]:
hyp_run = hyp_exp.submit(config=hyperdrive_config)
hyp_run



Experiment,Id,Type,Status,Details Page,Docs Page
hyperdrive_bank,HD_386097ef-2229-42a7-a7b0-8f1319fbaa12,hyperdrive,Running,Link to Azure Machine Learning studio,Link to Documentation


In [12]:
from azureml.widgets import RunDetails
RunDetails(hyp_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

We'll leave this processing, submit the AutoML experiment and come back to both models later.

# Submit AutoML experiment

In [13]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

data_file_source = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=data_file_source)

In [14]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

In [15]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=42)

In [16]:
datastore = ws.get_default_datastore()
type(datastore)

azureml.data.azure_storage_datastore.AzureBlobDatastore

In [17]:
x_train.assign(target=y_train).to_csv('df_train.csv')

In [18]:
datastore.upload_files(['df_train.csv'], overwrite=True)

Uploading an estimated of 1 files
Uploading df_train.csv
Uploaded df_train.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_workspaceblobstore

In [19]:
train_ds = TabularDatasetFactory.from_delimited_files((datastore, 'df_train.csv'))

In [20]:
type(train_ds)

azureml.data.tabular_dataset.TabularDataset

In [21]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

aml_classification_metrics = [
    'accuracy',
    'AUC_weighted',
    'average_precision_score_weighted',
    'norm_macro_recall',
    'precision_score_weighted'
]

automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='norm_macro_recall',
    training_data=train_ds,
    label_column_name='target',
    compute_target=compute_target,
    n_cross_validations=5
)

In [22]:
compute_target.wait_for_completion(show_output=True)

Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [23]:
# Submit your automl run

from azureml.core.experiment import Experiment

automl_exp = Experiment(ws, "automl_bank")
automl_run = automl_exp.submit(config=automl_config)
automl_run

Running on remote.


Experiment,Id,Type,Status,Details Page,Docs Page
automl_bank,AutoML_4e20e695-2b7a-43c3-ac5b-848ac349161d,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


# Review, evaluate and register models

In [24]:
from azureml.widgets import RunDetails
RunDetails(automl_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [25]:
hyp_run.wait_for_completion()
automl_run.wait_for_completion()

{'runId': 'AutoML_4e20e695-2b7a-43c3-ac5b-848ac349161d',
 'target': 'aml-compute',
 'status': 'Completed',
 'startTimeUtc': '2020-10-20T00:01:43.515002Z',
 'endTimeUtc': '2020-10-20T00:47:21.378715Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'norm_macro_recall',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'aml-compute',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"e2009077-1493-4e28-a5f2-d56cebd38fd9\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"df_train.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"opg217757-azureml\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"c6a455b8-27f8-45a1-bb69-60c79f39ac1b\\\\\\", \\\\\\"workspa

In [26]:
hyp_best_run = hyp_run.get_best_run_by_primary_metric()
hyp_best_run_metrics = hyp_best_run.get_metrics()

print('Best Run Id: ', hyp_best_run.id)
print('Accuracy: ', hyp_best_run_metrics['accuracy'])
print('Normalized recall score: ', hyp_best_run_metrics['norm_macro_recall'])

Best Run Id:  HD_386097ef-2229-42a7-a7b0-8f1319fbaa12_1
Accuracy:  0.9111785533636824
Normalized recall score:  0.37662739662739675


In [27]:
# Get your best run and save the model from that run.
hyp_model = hyp_best_run.register_model(
    model_name='bank-marketing-hyperdrive-best-model',
    model_path=os.path.join('outputs','model.joblib')
)

In [28]:
# Retrieve and save your best automl model.
automl_best_run = automl_run.get_best_child()
automl_best_run_metrics = automl_best_run.get_metrics()

print('Best Run Id: ', automl_best_run.id)
print('Accuracy: ', automl_best_run_metrics['accuracy'])
print('Normalized recall score: ', automl_best_run_metrics['norm_macro_recall'])

model = automl_best_run.register_model(
    model_name='bank-marketing-automl-best-model',
    model_path=os.path.join('outputs','model.pkl')
)

Best Run Id:  AutoML_4e20e695-2b7a-43c3-ac5b-848ac349161d_25
Accuracy:  0.8761760242792109
Normalized recall score:  0.7566733183982822


# Delete the compute target

In [31]:
try:
    compute_target.delete()
except:
    print('Already deleted')
else:
    compute_target.wait_for_completion(show_output=True, is_delete_operation=True)

DeletingCurrent provisioning state of AmlCompute is "Deleting"

...Current provisioning state of AmlCompute is "Deleting"

.......Current provisioning state of AmlCompute is "Deleting"

....Current provisioning state of AmlCompute is "Deleting"

.......Current provisioning state of AmlCompute is "Deleting"

....Current provisioning state of AmlCompute is "Deleting"

.......Current provisioning state of AmlCompute is "Deleting"

....Current provisioning state of AmlCompute is "Deleting"

.......Current provisioning state of AmlCompute is "Deleting"

....Current provisioning state of AmlCompute is "Deleting"

.......Current provisioning state of AmlCompute is "Deleting"

....Current provisioning state of AmlCompute is "Deleting"

.......Current provisioning state of AmlCompute is "Deleting"

....
SucceededProvisioning operation finished, operation "Succeeded"
