In [1]:
from azureml.core import Workspace, Experiment
# Load the workspace described by the local config file and bind the
# experiment that every run in this notebook is submitted to.
ws = Workspace.from_config()
exp = Experiment(name="udacity-project", workspace=ws)

# Echo the workspace identity, one field per line.
workspace_summary = "\n".join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
])
print(workspace_summary)

# Open an interactive logging run on the experiment.
# NOTE(review): no visible cell calls run.complete() on this run -- confirm
# whether it is still needed or should be completed at the end of the notebook.
run = exp.start_logging()

Workspace name: optimize_ml_pipeline
Azure region: eastus
Subscription id: 6fe6105a-78a2-472c-b075-5840c51586de
Resource group: udacity-azure-optim-ml-pipeline


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "compute-cluster"

# Project constraints: vm_size must be "Standard_D2_V2" and max_nodes must
# not exceed 4.
compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_V2", max_nodes=4)

# Get-or-create: attach to the cluster when one with this name is already in
# the workspace, and only provision a new one when the lookup fails. This keeps
# the cell safe to re-run (Restart & Run All) without issuing a second
# provisioning request for the same name.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("Found existing compute cluster '" + cluster_name + "', reusing it.")
except ComputeTargetException:
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# Block until provisioning has finished (returns quickly for an existing,
# already-provisioned cluster).
compute_target.wait_for_completion(show_output=True)


InProgress..
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Random search over the two hyperparameters:
#   --C        sampled uniformly between 0.01 and 10
#   --max_iter chosen from 50, 100 or 150
# NOTE(review): the keys are presumably command-line flags parsed by
# train.py -- confirm against that script.
ps = RandomParameterSampling(
    {
        '--C': uniform(0.01, 10),
        '--max_iter': choice(50, 100, 150),
    }
)

# Early-termination policy: Bandit, checked every 2 intervals with a slack
# factor of 0.1.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Make sure a "training" entry exists in the current working directory.
working_dir_entries = os.listdir()
if "training" not in working_dir_entries:
    os.mkdir("./training")

# Environment for the training run, built from the conda specification file.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='conda_dependencies.yml',
)

# Training job definition: run train.py from the current directory on the
# compute cluster, inside the environment defined above.
src = ScriptRunConfig(
    source_directory='.',
    script='train.py',
    compute_target=compute_target,
    environment=sklearn_env,
)

# HyperDrive sweep: maximise the 'Accuracy' metric over at most 20 child runs,
# running no more than 4 at a time.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)


In [4]:
# Launch the HyperDrive sweep on the experiment.
hyperdrive_run = exp.submit(config=hyperdrive_config)

# Show the run-details widget for the sweep.
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()

# Wait for the sweep to finish while streaming its log output. This is left
# as the trailing expression so its return value is rendered as cell output.
hyperdrive_run.wait_for_completion(show_output=True)


2024-06-30 09:26:09.724130: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /anaconda/envs/azureml_py38/lib/python3.9/site-packages/cv2/../../lib64:
2024-06-30 09:26:09.724165: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Failed to load image Python extension: libc10_cuda.so: cannot open shared object file: No such file or directory


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_494a6949-b797-4214-b166-cdf2a152b8c8
Web View: https://ml.azure.com/runs/HD_494a6949-b797-4214-b166-cdf2a152b8c8?wsid=/subscriptions/6fe6105a-78a2-472c-b075-5840c51586de/resourcegroups/udacity-azure-optim-ml-pipeline/workspaces/optimize_ml_pipeline&tid=46076513-b9f3-4767-b9a6-524bc3785643

Streaming azureml-logs/hyperdrive.txt

[2024-06-30T09:25:48.773354][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space
[2024-06-30T09:25:49.3666013Z][SCHEDULER][INFO]Scheduling job, id='HD_494a6949-b797-4214-b166-cdf2a152b8c8_0' 
[2024-06-30T09:25:49.5599889Z][SCHEDULER][INFO]Scheduling job, id='HD_494a6949-b797-4214-b166-cdf2a152b8c8_1' 
[2024-06-30T09:25:49.6089683Z][SCHEDULER][INFO]Scheduling job, id='HD_494a6949-b797-4214-b166-cdf2a152b8c8_2' 
[2024-06-30T09:25:49.657800][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.
[2024-06-30T09:25:49.7944546Z][SCHEDULER][INFO]Scheduling job, id='HD_494a6949-b797-4214-b166-cdf

{'runId': 'HD_494a6949-b797-4214-b166-cdf2a152b8c8',
 'target': 'compute-cluster',
 'status': 'Completed',
 'startTimeUtc': '2024-06-30T09:25:47.489995Z',
 'endTimeUtc': '2024-06-30T09:38:58.85453Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"Accuracy","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '6431f120-2ee0-4a3e-b413-22c28d27edcb',
  'user_agent': 'python/3.9.19 (Linux-5.15.0-1064-azure-x86_64-with-glibc2.31) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.56.0',
  'space_size': 'infinite_space_size',
  'best_child_run_id': 'HD_494a6949-b797-4214-b166-cdf2a152b8c8_13',
  'score': '0.9089529590288316',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_494a6949-b797-4214-b166-cdf2a152b8c8_13'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemet

In [6]:
import joblib

# Pick the child run with the best primary metric and register its model file.
best_run = hyperdrive_run.get_best_run_by_primary_metric()

# get_best_run_by_primary_metric() returns None when no child run logged the
# primary metric; fail with a clear message instead of an AttributeError on
# the register_model call below.
if best_run is None:
    raise RuntimeError(
        "HyperDrive produced no child run with the primary metric; "
        "there is no best model to register."
    )

# Register the model file stored in the best run's outputs.
# NOTE(review): assumes train.py writes the model to outputs/model.joblib --
# confirm against that script.
best_model = best_run.register_model(model_name='best_model', model_path='./outputs/model.joblib')


In [7]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Source CSV for the AutoML experiment (bank marketing training data).
data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# Build a TabularDataset from the delimited file at that URL.
ds = TabularDatasetFactory.from_delimited_files(path=data_path)


In [8]:
from train import clean_data

# Run the dataset through train.py's clean_data helper, which returns two
# values unpacked here as x and y (presumably features and labels -- confirm
# against train.py).
# NOTE(review): x and y are not referenced by the AutoML configuration cell,
# which trains on `ds` directly -- confirm whether this step is still needed.
x, y = clean_data(ds)


{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}


In [9]:
from azureml.train.automl import AutoMLConfig

# AutoML settings, collected in one place and unpacked into AutoMLConfig.
# NOTE: keep experiment_timeout_minutes at 30 -- a larger value will make the
# lab instance time out. Running the experiment for longer requires your own
# Azure tenant and will incur personal costs.
automl_settings = {
    "experiment_timeout_minutes": 30,
    "task": 'classification',
    "primary_metric": 'accuracy',
    "training_data": ds,
    "label_column_name": 'y',
    "n_cross_validations": 5,
    "compute_target": compute_target,
}
automl_config = AutoMLConfig(**automl_settings)


In [10]:
# Submit the AutoML configuration to the experiment, streaming its progress
# into the cell output.
automl_run = exp.submit(config=automl_config, show_output=True)


Submitting remote run.
No run_configuration provided, running on compute-cluster with default configuration
Running on remote compute: compute-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_fd9eb1a3-5b84-4e00-acbc-28ec958bcddf,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Generating individually featurized CV splits.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+---------------------------

In [11]:
# get_output() is unpacked into the best AutoML child run and its fitted model.
best_automl_run, fitted_model = automl_run.get_output()

# Register the model file from that run's outputs. This is left as the
# trailing expression so the resulting Model is shown as cell output.
best_automl_run.register_model(
    model_name='best_automl_model',
    model_path='./outputs/model.pkl',
)


2024-06-30 10:58:20.402808: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2024-06-30 10:58:25.673681: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-06-30 10:58:25.673790: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (aspjscript1): /proc/driver/nvidia/version does not exist


Model(workspace=Workspace.create(name='optimize_ml_pipeline', subscription_id='6fe6105a-78a2-472c-b075-5840c51586de', resource_group='udacity-azure-optim-ml-pipeline'), name=best_automl_model, id=best_automl_model:2, version=2, tags={}, properties={})

In [12]:
# Request deletion of the compute cluster.
# The status lines emitted after this call still report the provisioning state
# as "Deleting", so the message states that deletion was requested rather than
# claiming it has already finished.
compute_target.delete()
print("Compute cluster deletion requested.")


Compute cluster deleted.
Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

