In [1]:
from azureml.core import Workspace, Experiment

# Connect to the workspace described by the local config file and open the
# experiment that the HyperDrive runs of this notebook are recorded under.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

# One line per workspace property, printed as a single newline-separated summary.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Open an interactive logging run on the experiment.
run = exp.start_logging()

Workspace name: nnudacityworkspace
Azure region: eastus
Subscription id: 2e29b216-8fb1-4e85-9aed-88f9d1978cd6
Resource group: mlstudio_udacity


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "NNComputeCluster"

# Reuse the compute cluster when the workspace already has one with this name;
# otherwise provision a new one with at most 4 nodes.
# NOTE(review): the project brief asks for vm_size "Standard_D2_V2", but this
# notebook provisions STANDARD_E4DS_V4 — confirm the deviation is intentional.
try:
    aml_compute = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_E4DS_V4',
                                                           max_nodes=4)
    aml_compute = ComputeTarget.create(ws, cluster_name, compute_config)

# Wait for the cluster to report its provisioning state before later cells
# submit work to it. This call was previously commented out even though the
# recorded cell output comes from it, so a fresh run could not reproduce it.
aml_compute.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [17]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Hyperparameter search space: random sampling over the --C and --max_iter
# arguments of the training script (continuous range for C, discrete set for max_iter).
ps = RandomParameterSampling(
    {
    '--C': uniform(0.1, 1.0),
    '--max_iter': choice(100, 150, 200, 250, 300)
    }
)

# Bandit early-termination policy: slack factor 0.1, applied every 2 evaluation intervals.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Make sure the script folder exists. makedirs(exist_ok=True) replaces the
# listdir()-then-mkdir check: it is a single call with no check/create gap, and
# it fails immediately if "training" exists but is not a directory.
os.makedirs("./training", exist_ok=True)

# Environment for the training run, built from the conda specification file
# that sits next to this notebook.
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Script run configuration: which script to run, from which folder, on which
# compute target, and in which environment.
# NOTE(review): '--C' and '--max_iter' are also produced by the sampler above;
# presumably the sampled values take precedence over these static defaults —
# confirm against train.py's argument parsing.
src = ScriptRunConfig(source_directory='./training',
                      script='train.py',
                      arguments=['--C', 1.0, '--max_iter', 100],
                      compute_target=aml_compute,
                      environment=sklearn_env)

# HyperDrive sweep: maximize the 'Accuracy' metric over at most 20 runs,
# with up to 4 of them running concurrently.
hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=20,
                                     max_concurrent_runs=4)

In [18]:
# Launch the HyperDrive sweep under the experiment, then render the
# run-details widget for it in the notebook.
hyperdrive_run = exp.submit(hyperdrive_config)

run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [19]:
# Wait for the HyperDrive run to finish; show_output=True streams the run's
# logs into the cell while waiting. Because the call is the cell's last
# expression, its return value is also rendered as the cell output.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_adc87844-b1ef-4817-9686-782065c90d0b
Web View: https://ml.azure.com/runs/HD_adc87844-b1ef-4817-9686-782065c90d0b?wsid=/subscriptions/2e29b216-8fb1-4e85-9aed-88f9d1978cd6/resourcegroups/mlstudio_udacity/workspaces/nnudacityworkspace&tid=ca55feda-17c1-4417-8e67-f85f2e6a3c22

Streaming azureml-logs/hyperdrive.txt

[2023-12-11T06:49:07.181707][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space
[2023-12-11T06:49:07.6339153Z][SCHEDULER][INFO]Scheduling job, id='HD_adc87844-b1ef-4817-9686-782065c90d0b_0' 
[2023-12-11T06:49:07.8178474Z][SCHEDULER][INFO]Scheduling job, id='HD_adc87844-b1ef-4817-9686-782065c90d0b_1' 
[2023-12-11T06:49:07.9387806Z][SCHEDULER][INFO]Scheduling job, id='HD_adc87844-b1ef-4817-9686-782065c90d0b_2' 
[2023-12-11T06:49:08.0121199Z][SCHEDULER][INFO]Scheduling job, id='HD_adc87844-b1ef-4817-9686-782065c90d0b_3' 
[2023-12-11T06:49:07.967837][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.
[20

{'runId': 'HD_adc87844-b1ef-4817-9686-782065c90d0b',
 'target': 'NNComputeCluster',
 'status': 'Completed',
 'startTimeUtc': '2023-12-11T06:49:06.454671Z',
 'endTimeUtc': '2023-12-11T07:08:24.960Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"Accuracy","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '707f62bb-765f-48d1-8eec-da2eae166ca8',
  'user_agent': 'python/3.8.5 (Linux-5.15.0-1040-azure-x86_64-with-glibc2.10) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.51.0',
  'space_size': 'infinite_space_size',
  'best_child_run_id': 'HD_adc87844-b1ef-4817-9686-782065c90d0b_19',
  'score': '0.917754172989378',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_adc87844-b1ef-4817-9686-782065c90d0b_19'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemetryV

In [20]:
import joblib
# Select the child run that scored best on the primary metric and report
# its id together with the accuracy it logged.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

best_accuracy = best_run_metrics['Accuracy']
print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_accuracy)

Best Run Id:  HD_adc87844-b1ef-4817-9686-782065c90d0b_19

 Accuracy: 0.9177541729893779


In [24]:
# Download the model file from the best run's outputs folder into the
# current working directory, keeping the same file name.
model_file_name = 'model.pkl'
remote_model_path = 'outputs/' + model_file_name
best_run.download_file(name=remote_model_path, output_file_path=model_file_name)

In [26]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Location of the bank-marketing training CSV used throughout this notebook.
data_url = (
    "https://automlsamplenotebookdata.blob.core.windows.net"
    "/automl-sample-notebook-data/bankmarketing_train.csv"
)

# Build a TabularDataset from the delimited file at that URL.
ds = TabularDatasetFactory.from_delimited_files(path=data_url)

In [28]:
from training.train import clean_data

# Run the project's clean_data helper (imported from training/train.py) on the dataset.
# NOTE(review): presumably x holds the cleaned features and y the labels — confirm in train.py.
# NOTE(review): x and y are not referenced again in the visible cells; the AutoML
# configuration below is given `ds` directly.
x, y = clean_data(ds)

In [33]:
from azureml.train.automl import AutoMLConfig

# AutoML settings, collected in one dict and unpacked into AutoMLConfig.
# NOTE: keep experiment_timeout_minutes at 30 — changing it will make the provided
# instance time out. To run the experiment longer, run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_settings = {
    'experiment_timeout_minutes': 30,
    'task': 'classification',
    'primary_metric': 'accuracy',
    'training_data': ds,          # the TabularDataset loaded above
    'label_column_name': 'y',     # column AutoML should predict
    'n_cross_validations': 5,
}

automl_config = AutoMLConfig(**automl_settings)

In [34]:
# Run AutoML under a dedicated experiment in the same workspace.
automl_experiment_name = 'automl_experiment_project'
experiment = Experiment(name=automl_experiment_name, workspace=ws)

# Submit the AutoML configuration; show_output=True prints progress in the cell.
automl_run = experiment.submit(automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running in the active local environment.


Experiment,Id,Type,Status,Details Page,Docs Page
automl_experiment_project,AutoML_95ed4bcb-b7f7-42c8-a212-f5c9f8cf862d,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.





********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+--------------------------------------+
|Size of the smallest class    |Name/Label of the smallest class|Number of samples in the training data|
|3692                          |yes                             |32950                                 |
+------------------------------+--------------------------------+--------------------------------------+

*********************************************************************

2023-12-11:08:00:16,473 INFO     [explanation_client.py:334] Using default datastore for uploads


Current status: EngineeredFeatureExplanations. Computation of engineered features completed
Current status: RawFeaturesExplanations. Computation of raw features started
Current status: RawFeaturesExplanations. Computation of raw features completed
Current status: BestRunExplainModel. Best run model explanations completed
********************************************************************************************


In [35]:
# get_output() hands back the best AutoML child run together with its fitted model.
automl_output = automl_run.get_output()
best_run, fitted_model = automl_output
print("Best Run:", best_run)

# Serialize the fitted model to a local pickle file with joblib.
model_file_name = 'best_automl_model.pkl'
joblib.dump(fitted_model, model_file_name)
print(f"Model saved to: {model_file_name}")

Best Run: Run(Experiment: automl_experiment_project,
Id: AutoML_95ed4bcb-b7f7-42c8-a212-f5c9f8cf862d_32,
Type: None,
Status: Completed)
Model saved to: best_automl_model.pkl
