In [1]:
from azureml.core import Workspace, Experiment

# Load the workspace from its saved configuration and open the experiment
# that the HyperDrive and AutoML runs in this notebook are submitted to.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project-8")

# Echo the workspace identity as a quick check that the expected workspace was loaded.
print('Workspace name: ' + ws.name,
      'Azure region: ' + ws.location,
      'Subscription id: ' + ws.subscription_id,
      'Resource group: ' + ws.resource_group, sep='\n')

# NOTE: exp.start_logging() is intentionally not called here. It would open an
# interactive run that is never completed, and run_hyperdrive is assigned by
# exp.submit(hyperdrive_config) in a later cell before it is first used.

Workspace name: udacity_training
Azure region: westeurope
Subscription id: b6039200-c49c-40db-80f3-83b6dc2b92c8
Resource group: predictive_parametring_studs_fiz


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the AmlCompute cluster used for the HyperDrive child runs.
cluster_name = "tutunjiancluster"

# Project constraints for the cluster: vm_size "Standard_D2_V2", max_nodes no greater than 4.
# Attach to the cluster when it already exists in the workspace; provision it otherwise.
try:
    p1_compute = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    provisioning = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    p1_compute = ComputeTarget.create(ws, cluster_name, provisioning)
else:
    print('Found existing cluster, use it.')

# Block until the cluster reports its provisioning state.
p1_compute.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Discrete search space handed to train.py as command-line arguments:
# five candidate values for --C and six for --max_iter.
search_space = {
    '--C': choice(1, 2, 3, 4, 5),
    '--max_iter': choice(50, 100, 150, 200, 250, 300),
}

# Random sampling over the search space above.
ps = RandomParameterSampling(search_space)

# Early-termination policy: Bandit with slack factor 0.1, evaluated every 3 intervals.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=3)

# Make sure the script source directory exists. exist_ok=True makes this a no-op
# when the directory is already there, so no separate os.listdir() check is needed
# (same pattern as the ./models directory created later in this notebook).
os.makedirs("./training", exist_ok=True)

# Training environment, built from the conda specification file conda_dependencies.yml.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='conda_dependencies.yml',
)

# Script run configuration: train.py from ./training, executed on the compute
# cluster inside the environment defined above.
src = ScriptRunConfig(
    source_directory="./training",
    script='train.py',
    compute_target=p1_compute,
    environment=sklearn_env,
)

# HyperDrive sweep: combines the script config, the parameter sampler and the
# early-termination policy; maximizes the 'Accuracy' metric over at most 20 runs,
# with up to 4 running concurrently.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)

In [4]:
# Submit the HyperDrive configuration to the experiment.
# The returned run handle is what the following cells use to display the
# run-details widget, wait for completion and pick the best child run.

run_hyperdrive = exp.submit(hyperdrive_config)

In [6]:
# Show the Azure ML run-details widget for the submitted HyperDrive run.
RunDetails(run_hyperdrive).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [8]:
# Block until the HyperDrive run has finished; show_output=True streams its log to the cell output.
run_hyperdrive.wait_for_completion(show_output=True)

RunId: HD_409f7389-90d2-4c43-93a5-46691fbb511a
Web View: https://ml.azure.com/runs/HD_409f7389-90d2-4c43-93a5-46691fbb511a?wsid=/subscriptions/b6039200-c49c-40db-80f3-83b6dc2b92c8/resourcegroups/predictive_parametring_studs_fiz/workspaces/udacity_training&tid=69c3b14b-e78e-400a-a3d2-7c0ae48f5375

Streaming azureml-logs/hyperdrive.txt

"<START>[2022-02-08T20:39:37.516355][API][INFO]Experiment created<END>\n""<START>[2022-02-08T20:39:38.527996][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2022-02-08T20:39:39.236792][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n""<START>[2022-02-08T20:40:08.494862][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2022-02-08T20:40:08.802362][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2022-02-08T20:40:37.9394635Z][SCHEDULER][INFO]Scheduling job, 

{'runId': 'HD_409f7389-90d2-4c43-93a5-46691fbb511a',
 'target': 'tutunjiancluster',
 'status': 'Completed',
 'startTimeUtc': '2022-02-08T20:39:37.243533Z',
 'endTimeUtc': '2022-02-08T20:54:46.249072Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '4665de55-0a12-4eb8-bf3a-d3e23b0b90a3',
  'user_agent': 'python/3.6.9 (Linux-5.4.0-1064-azure-x86_64-with-debian-buster-sid) msrest/0.6.21 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.37.0',
  'space_size': '30',
  'score': '0.9170498436637852',
  'best_child_run_id': 'HD_409f7389-90d2-4c43-93a5-46691fbb511a_1',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://udacitytrainin4287654940.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_409f7389-90d2-4c43-93a5-46691

In [11]:
import joblib
# Get the best HyperDrive child run and save the model produced by that run.
os.makedirs("./models", exist_ok=True)

best_run = run_hyperdrive.get_best_run_by_primary_metric()
# get_best_run_by_primary_metric() returns None when no child run logged the
# primary metric; fail with a clear message instead of an AttributeError below.
if best_run is None:
    raise RuntimeError(
        "HyperDrive run has no child run with the primary metric logged; "
        "cannot select a best run."
    )

# Command-line arguments (the sampled hyperparameters) the best child run was started with.
print('arguments of best run', best_run.get_details()['runDefinition']['arguments'])

best_run_metrics = best_run.get_metrics()

print('ID of the best run: ', best_run.id)
print('Accuracy of the best run:', best_run_metrics['Accuracy'])

# Download the model file written by the best child run into ./models.
best_run.download_file("/outputs/model.joblib", "./models/sklearn_best_model.joblib")


arguments of best run ['--C', '1', '--max_iter', '200']
ID of the best run:  HD_409f7389-90d2-4c43-93a5-46691fbb511a_1
Accuracy of the best run: 0.9170498436637852


In [12]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build a TabularDataset from the bank-marketing training CSV using TabularDatasetFactory.
bankmarketing_train_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# Parsing options forwarded unchanged to from_delimited_files.
csv_read_options = dict(
    validate=True,
    include_path=False,
    infer_column_types=True,
    set_column_types=None,
    separator=',',
    header=True,
    partition_format=None,
    support_multi_line=False,
    empty_as_string=False,
    encoding='utf8',
)

ds1 = TabularDatasetFactory.from_delimited_files(bankmarketing_train_url, **csv_read_options)

In [13]:
# Use the clean_data function to clean your data.
import pandas as pd
def clean_data(data):
    """Convert the raw bank-marketing dataset into an all-numeric pandas DataFrame.

    Args:
        data: Object exposing ``to_pandas_dataframe()``.

    Returns:
        pandas.DataFrame with rows containing missing values removed, where
        ``job``, ``contact`` and ``education`` are replaced by one-hot columns
        appended on the right (in that order); ``marital``, ``default``,
        ``housing``, ``loan``, ``poutcome`` and the label ``y`` are recoded
        to 0/1; ``month`` and ``day_of_week`` are mapped to numbers.
    """
    # Lookup tables for the two calendar columns.
    month_number = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}
    weekday_number = {"mon":1, "tue":2, "wed":3, "thu":4, "fri":5, "sat":6, "sun":7}

    # Categorical columns expanded into dummies; listed in the order their
    # dummy columns end up appended to the frame.
    one_hot_columns = ("job", "contact", "education")

    # Two-valued columns paired with the value that is encoded as 1
    # (every other value becomes 0).
    positive_values = (
        ("marital", "married"),
        ("default", "yes"),
        ("housing", "yes"),
        ("loan", "yes"),
        ("poutcome", "success"),
        ("y", "yes"),
    )

    frame = data.to_pandas_dataframe().dropna()

    for column in one_hot_columns:
        dummies = pd.get_dummies(frame[column], prefix=column)
        frame = frame.drop(column, axis=1).join(dummies)

    for column, positive in positive_values:
        frame[column] = frame[column].apply(lambda value: 1 if value == positive else 0)

    frame["month"] = frame["month"].map(month_number)
    frame["day_of_week"] = frame["day_of_week"].map(weekday_number)

    return frame


In [14]:
# Clean the bank-marketing dataset with clean_data; the resulting DataFrame
# (label column 'y' included) is what AutoMLConfig receives as training_data.
x = clean_data(ds1)
# Preview the first rows of the cleaned frame.
x.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_cellular,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown
0,57,1,0,0,1,5,1,371,1,999,...,1,0,0,0,0,1,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,0,1,0,0,0,0,0,0,0,1
2,33,1,0,0,0,5,5,52,1,999,...,1,0,0,0,1,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,0,1,0,0,0,1,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,1,0,0,0,0,1,0,0,0,0


In [15]:
from azureml.train.automl import AutoMLConfig

# AutoML settings: classification on the cleaned DataFrame `x`, label column 'y',
# optimized for accuracy with 4-fold cross-validation.
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_settings = {
    "experiment_timeout_minutes": 30,
    "task": "classification",
    "primary_metric": "accuracy",
    "training_data": x,
    "label_column_name": 'y',
    "n_cross_validations": 4,
}

automl_config = AutoMLConfig(**automl_settings)

In [16]:
# Submit the AutoML configuration to the experiment; show_output=True prints
# progress to the cell output while the run executes.
# NOTE(review): automl_config sets no compute target, and the recorded output
# reports the run executing locally — confirm that is intended.

run_autoML = exp.submit(automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running in the active local environment.


2022-02-08:21:31:33,650 INFO     [modeling_bert.py:226] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2022-02-08:21:31:33,655 INFO     [modeling_xlnet.py:339] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2022-02-08:21:31:36,741 INFO     [utils.py:159] NumExpr defaulting to 4 threads.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project-8,AutoML_8726f60b-36cd-4d51-90fa-4987ee7f1ca4,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one cl



In [18]:
# Retrieve and save the best AutoML model.
# get_output() hands back the best child run together with its fitted model.
best_run_autoML, best_model_autoML = run_autoML.get_output()

# Ensure the target folder exists so this cell does not depend on an earlier
# cell having created it.
os.makedirs("./models", exist_ok=True)

# Kept as the cell's last expression so the list of written files is displayed.
joblib.dump(best_model_autoML, './models/automl-best-model.joblib')

['./models/automl-best-model.joblib']