In [1]:
import joblib
import pandas as pd
from azureml.widgets import RunDetails
from utilities.evaluation import ModelEvaluation
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Workspace, Experiment, ScriptRunConfig, Environment

# Optimization
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.parameter_expressions import uniform, quniform

In [2]:
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="bankmarketing_model")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: quick-starts-ws-134266
Azure region: southcentralus
Subscription id: 5a4ab2ba-6c51-4805-8155-58759ad589d8
Resource group: aml-quickstarts-134266


In [3]:
# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

cpu_cluster_name = 'cpu-cluster'

try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_V2",
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [5]:
sklearn_env = Environment.from_conda_specification(name='sklearn-env',
                                                   file_path='./src/conda_dependencies.yml')

# Create a SKLearn estimator for use with train.py
estimator = ScriptRunConfig(source_directory='./src',
                            script='train.py',
                            arguments=['--C', 1.0, '--max_iter', 100],
                            compute_target=cpu_cluster,
                            environment=sklearn_env)

In [8]:
# Specify parameter sampler

params = {'--C': uniform(0.1, 1),
          '--max_iter': quniform(100, 1500, 100)}

ps = RandomParameterSampling(params)

# Specify a Policy
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)


# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=estimator,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=12,
                                     max_concurrent_runs=4)

In [9]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
hyperdrive_run = exp.submit(config=hyperdrive_config)

In [10]:
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [11]:
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_4c4afcaa-3447-44be-9cdb-9d61fd60e709
Web View: https://ml.azure.com/experiments/bankmarketing_model/runs/HD_4c4afcaa-3447-44be-9cdb-9d61fd60e709?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-134266/workspaces/quick-starts-ws-134266

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-10T19:42:44.205435][API][INFO]Experiment created<END>\n"<START>[2021-01-10T19:42:46.3504764Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2021-01-10T19:42:47.280996][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n""<START>[2021-01-10T19:42:46.896764][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n"

Execution Summary
RunId: HD_4c4afcaa-3447-44be-9cdb-9d61fd60e709
Web View: https://ml.azure.com/experiments/bankmarketing_model/runs/HD_4c4afcaa-3447-44be-9cdb-9d61fd60e709?wsid=/subscriptio

{'runId': 'HD_4c4afcaa-3447-44be-9cdb-9d61fd60e709',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-01-10T19:42:43.625415Z',
 'endTimeUtc': '2021-01-10T19:56:26.983114Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '04a98ef2-d23e-471f-a1ad-426b74509098',
  'score': '0.9113808801213961',
  'best_child_run_id': 'HD_4c4afcaa-3447-44be-9cdb-9d61fd60e709_7',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg134266.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_4c4afcaa-3447-44be-9cdb-9d61fd60e709/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=hWfae4lN8CuRwodUUL7Dmi82w21jkVIM3jAC7uBLjQU%3D&st=2021-01-10T19%3A46%3A35Z&se=2021-01-11T03%3A56%3A35Z&sp=r'}}

In [13]:
assert(hyperdrive_run.get_status() == 'Completed')

In [14]:
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run.get_details()['runDefinition']['arguments'])

['--C', '1', '--max_iter', '100', '--C', '0.49834091712696993', '--max_iter', '100']


In [15]:
print(best_run.get_file_names())

['azureml-logs/55_azureml-execution-tvmps_534070ab714e6f780ee989ab9584daa76e4badfd26378b06d79f512e007ac360_d.txt', 'azureml-logs/65_job_prep-tvmps_534070ab714e6f780ee989ab9584daa76e4badfd26378b06d79f512e007ac360_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_534070ab714e6f780ee989ab9584daa76e4badfd26378b06d79f512e007ac360_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/107_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'logs/azureml/dataprep/engine_spans_l_ed549d7b-80cb-48eb-b7d8-29a1c358d6db.jsonl', 'logs/azureml/dataprep/python_span_l_ed549d7b-80cb-48eb-b7d8-29a1c358d6db.jsonl', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model.joblib']


In [16]:
model = best_run.register_model(model_name='bank-marketing-model', model_path='outputs/model.joblib')

In [17]:
model.download(target_dir='outputs', exist_ok=True)

'outputs/model.joblib'

In [18]:
# Get your best run and save the model from that run.
logistic = joblib.load('outputs/model.joblib')

Trying to unpickle estimator LogisticRegression from version 0.23.2 when using version 0.22.2.post1. This might lead to breaking code or invalid results. Use at your own risk.


In [19]:
# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_test.csv"
ds = TabularDatasetFactory().from_delimited_files(path)

In [46]:
from src.train import clean_data

# Use the clean_data function to clean your data.
test_data, test_label = clean_data(ds)
evaluation = ModelEvaluation(observed=test_label, predicted=logistic.predict(test_data))
evaluation.calculate_metrics()
evaluation.print_metrics()
evaluation.confusion_matrix()

The accuracy is: 0.91
The precision is: 0.68
The recall is: 0.38
The F1 score is: 0.49 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.98,0.02
1,0.62,0.38


## AutoML Pipeline

In [21]:
experiment_automl = Experiment(workspace=ws, name='bankmarketing-automl-model')

print(f'Workspace name: {ws.name}',
      f'Azure region: {ws.location}',
      f'Resource group: {ws.resource_group}', sep='\n')

Workspace name: quick-starts-ws-134266
Azure region: southcentralus
Resource group: aml-quickstarts-134266


In [22]:
datastore = ws.get_default_datastore()

factory = TabularDatasetFactory()
path_train_data = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
path_test_data = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_test.csv'

ds_train = factory.from_delimited_files(path_train_data)
ds_test = factory.from_delimited_files(path_test_data)

In [23]:
import logging
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

label = 'y'

automl_settings = {
    'enable_early_stopping': True,
    'iteration_timeout_minutes': 5,
    'max_concurrent_iterations': 4,
    'max_cores_per_iteration': -1,
    'primary_metric': 'accuracy',
    'featurization': 'auto',
    'verbosity': logging.INFO
}

automl_config = AutoMLConfig(experiment_timeout_minutes=30,
                             task='classification',
                             debug_log='automl_errors.log',
                             compute_target=cpu_cluster,
                             training_data=ds_train,
                             label_column_name=label,
                             n_cross_validations=5,
                             **automl_settings)

In [24]:
# Submit your automl run
remote_run = experiment_automl.submit(automl_config, show_output=False)

Running on remote.


In [25]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [26]:
remote_run.wait_for_completion(show_output=True)


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the sm

{'runId': 'AutoML_1da94b82-2b04-4b94-adb8-811b06c44566',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-01-10T19:59:44.614953Z',
 'endTimeUtc': '2021-01-10T20:35:01.334255Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"925a3c3c-7366-4ed7-a375-973e1c54c0a0\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"isArchive\\\\\\": false, \\\\\\"path\\\\\\": {\\\\\\"target\\\\\\": 1, \\\\\\"resourceDetails\\\\\\": [{\\\\\\"path\\\\\\": \\\\\\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv\\\\\\", \\\\\\"sas\\\\\\": null, \\\\\\"storageAccountName\\\\\\": n

In [27]:
# Retrieve and save your best automl model.

best_run, fitted_model = remote_run.get_output()
print(best_run)
print(fitted_model)

Run(Experiment: bankmarketing-automl-model,
Id: AutoML_1da94b82-2b04-4b94-adb8-811b06c44566_55,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               reg_lambda=1.3541666666666667,
                                                                                               scale_pos_weight=1,
           

In [28]:
best_run.get_file_names()

['accuracy_table',
 'automl_driver.py',
 'azureml-logs/55_azureml-execution-tvmps_8b5abdf0a92a600f592d161656a068161e6dde9b134aa1163a4be8fcb10fd0a7_d.txt',
 'azureml-logs/65_job_prep-tvmps_8b5abdf0a92a600f592d161656a068161e6dde9b134aa1163a4be8fcb10fd0a7_d.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_8b5abdf0a92a600f592d161656a068161e6dde9b134aa1163a4be8fcb10fd0a7_d.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'confusion_matrix',
 'logs/azureml/103_azureml.log',
 'logs/azureml/azureml_automl.log',
 'logs/azureml/dataprep/python_span_37859d58-318a-420d-8458-4611e114def8.jsonl',
 'logs/azureml/dataprep/python_span_78cd6d0d-a66c-4a12-a7a0-472422e19f56.jsonl',
 'logs/azureml/dataprep/python_span_7e519ec2-abba-45a5-a59e-41e04d1fdb08.jsonl',
 'logs/azureml/job_prep_azureml.log',
 'logs/azureml/job_release_azureml.log',
 'outputs/conda_env_v_1_0_0.yml',
 'outputs/env_dependencies.json',
 'outputs/model.pkl',
 'outputs/pipeline_graph

In [29]:
best_run.download_file('outputs/model.pkl', 'outputs/model_aml.pkl')
best_run.download_file('outputs/scoring_file_v_1_0_0.py', 'outputs/score_aml.py')
best_run.download_file('automl_driver.py', 'outputs/automl_driver.py')

In [30]:
import pickle

aml_model = pickle.load(open('outputs/model_aml.pkl', 'rb'))

In [37]:
test_data = pd.read_csv(path_test_data)

test_data['y'] = test_data['y'].map({'no': 0, 'yes': 1})

y_test = test_data.pop('y')

In [42]:
predicted = pd.Series(aml_model.predict(test_data)).map({'no': 0, 'yes': 1})

In [45]:
aml_evaluation = ModelEvaluation(observed=y_test, predicted=predicted)
aml_evaluation.calculate_metrics()
aml_evaluation.print_metrics()
aml_evaluation.confusion_matrix()

The accuracy is: 0.92
The precision is: 0.7
The recall is: 0.55
The F1 score is: 0.62 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.97,0.03
1,0.45,0.55


In [47]:
try:
    cpu_cluster.delete()
    print('Compute target deleted!')
except ComputeTargetException:
    print('Compute target not found!')

Compute target deleted!
