In [1]:
from azureml.core import Workspace, Experiment

# Connect to the workspace via Workspace.from_config() and open the
# "udacity-project" experiment inside it.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

# Summarise which workspace we are attached to, one field per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Start an interactive logging run on the experiment.
# NOTE(review): no run.complete() is visible in this notebook — confirm the
# run is closed somewhere.
run = exp.start_logging()

To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code E9W9DZ2WS to authenticate.


In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "ml-project-opt1"

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
# Reuse the cluster when a compute target with this name already exists;
# otherwise provision a new one (STANDARD_D2_V2, at most 4 nodes).
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print("Found existing cluster")

# Wait on the compute target in both cases, streaming provisioning status.
compute_target.wait_for_completion(show_output=True)

InProgress...............
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [10]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Specify parameter sampler: random draws from discrete choices for the
# "--C" and "--max_iter" arguments.
parameter = {
    "--C": choice(0.001, 0.01, 0.1, 1, 10, 100, 200, 1000),
    "--max_iter": choice(10, 50, 100, 200, 500, 1000),
}
ps = RandomParameterSampling(parameter)

# Specify a Policy: Bandit policy with evaluation_interval=2 and a slack
# factor of 0.1.
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Make sure the ./training directory exists. exist_ok=True keeps the cell
# idempotent on re-run and avoids the check-then-create race of
# os.listdir() + os.mkdir().
os.makedirs("training", exist_ok=True)

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='conda_dependencies.yml',
)

# Create a ScriptRunConfig Object to specify the configuration details of your training job.
# No fixed `arguments` are set here; "--C" and "--max_iter" come from the sampler `ps`.
# NOTE(review): source_directory='.' points at the whole working directory —
# confirm that everything in it is meant to be part of the training job.
src = ScriptRunConfig(
    source_directory='.',
    script='train.py',
    compute_target=cluster_name,
    environment=sklearn_env,
)

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name="Accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=40,
    max_concurrent_runs=4,
)

In [11]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
hyperdrive_run = exp.submit(config=hyperdrive_config)

# Show the monitoring widget, then block until the sweep has finished while
# streaming its output into the cell.
run_widget = RunDetails(hyperdrive_run)
run_widget.show()
hyperdrive_run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [21]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
best = hyperdrive_run.get_best_run_by_primary_metric()
if best is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError("HyperDrive returned no best run for the primary metric.")

# Keep the details so the best run's own arguments can be reported; the
# original discarded this result.
best_details = best.get_details()

# Hyperparameters sampled for every child run, keyed by child run id.
print(hyperdrive_run.get_hyperparameters())


print("\nBest Metrics and parameters")
print(best.get_metrics())
# Arguments the best child run was launched with.
print(best_details['runDefinition']['arguments'])


{'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_0': '{"--C": 0.001, "--max_iter": 10}', 'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_1': '{"--C": 0.1, "--max_iter": 100}', 'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_2': '{"--C": 0.001, "--max_iter": 500}', 'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_3': '{"--C": 0.001, "--max_iter": 100}', 'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_4': '{"--C": 100, "--max_iter": 200}', 'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_5': '{"--C": 0.001, "--max_iter": 1000}', 'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_6': '{"--C": 1000, "--max_iter": 100}', 'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_7': '{"--C": 1000, "--max_iter": 50}', 'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_8': '{"--C": 0.1, "--max_iter": 10}', 'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_9': '{"--C": 100, "--max_iter": 500}', 'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_10': '{"--C": 200, "--max_iter": 10}', 'HD_3c52880f-2250-4b63-821c-c53acb7c9a36_11': '{"--C": 0.01, "--max_iter": 10}', 'HD_3c52880f-2250-4b63-821c-c5

In [25]:
import os
# Debug aid: print the current working directory of the notebook kernel.
print(os.getcwd())

/mnt/batch/tasks/shared/LS_root/mounts/clusters/ml-project/code/Users/odl_user_215159


In [36]:
# Register a model named 'hpt_model' from the best HyperDrive run.
# NOTE(review): model_path='.' is very broad — confirm where train.py writes
# the model file and whether a narrower path should be used.
model = best.register_model(model_name='hpt_model', model_path='.')


In [37]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at the URL below.
url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
# Build the dataset directly from the hosted CSV file.
ds = TabularDatasetFactory.from_delimited_files(path=url)

In [38]:
from train import clean_data

# Use the clean_data function to clean your data.
# clean_data comes from train.py; judging by the outputs shown further down,
# x is a feature DataFrame and y a label Series named "y".
x, y = clean_data(ds)

### Saving the Best Hyperparameter Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import joblib  # imported here so this cell does not rely on an earlier cell's import

# Hold out 25% of the cleaned data; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=76)

# best_parameters
# NOTE(review): these values are hard-coded — confirm they match the arguments
# of the best HyperDrive run before relying on the saved model.
c = 100
max_iter = 200
lr_model = LogisticRegression(C=c, max_iter=max_iter).fit(x_train, y_train)

# Use the held-out split (previously computed but never used) to report the
# mean accuracy of the refitted model.
print("Hold-out accuracy:", lr_model.score(x_test, y_test))

# Persist the fitted estimator next to the notebook.
joblib.dump(lr_model, filename='best_hyper_para_model.sav')

In [44]:
# Preview the first rows of the cleaned feature frame.
x.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_cellular,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown
0,57,1,0,0,1,5,1,371,1,999,...,1,0,0,0,0,1,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,0,1,0,0,0,0,0,0,0,1
2,33,1,0,0,0,5,5,52,1,999,...,1,0,0,0,1,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,0,1,0,0,0,1,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,1,0,0,0,0,1,0,0,0,0


In [46]:
# Pairwise correlation between the feature columns.
x.corr()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_cellular,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown
age,1.0,0.269381,0.002098,3.446979e-07,-0.007736,0.079432,-0.017167,-0.003085,0.000726,-0.037783,...,-0.005926,0.005926,0.243342,0.013124,-0.038858,-0.109628,0.019829,0.005201,-0.068805,0.06294
marital,0.2693811,1.0,0.007695,-0.007299599,-0.000857,0.013479,0.000421,-0.005531,0.002236,0.029046,...,-0.062103,0.062103,0.115424,0.080411,0.070426,-0.071771,0.014298,-0.003817,-0.103378,3e-05
default,0.002097518,0.007695,1.0,-0.003623044,-0.004037,0.011196,-0.00663,-0.005711,-0.004241,0.001873,...,0.007224,-0.007224,-0.003201,-0.002325,-0.003969,0.002363,-0.000204,0.015358,-0.006171,-0.002007
housing,3.446979e-07,-0.0073,-0.003623,1.0,0.053693,0.030002,-0.005982,-0.005065,-0.011009,-0.010612,...,0.082675,-0.082675,-0.012307,-0.01002,-0.000798,-0.008966,0.000443,0.016412,0.014026,-0.009072
loan,-0.007735566,-0.000857,-0.004037,0.05369264,1.0,-0.004832,-0.001672,-0.005312,0.004934,0.000671,...,0.009012,-0.009012,-0.001113,-0.005657,-0.006863,8.3e-05,0.002866,0.000509,0.01074,-0.005425
month,0.07943181,0.013479,0.011196,0.03000209,-0.004832,1.0,-0.007383,-0.021745,-0.031788,-0.083171,...,0.32368,-0.32368,-0.033108,-0.057527,-0.078315,-0.036955,0.005521,0.034065,0.126013,-0.012178
day_of_week,-0.01716694,0.000421,-0.00663,-0.00598158,-0.001672,-0.007383,1.0,0.008583,0.014436,0.003347,...,-0.02393,0.02393,-0.00449,0.006401,0.005722,-0.005468,0.004322,-0.000884,-0.003457,0.009587
duration,-0.003085231,-0.005531,-0.005711,-0.005064525,-0.005312,-0.021745,0.008583,1.0,-0.071795,-0.047941,...,0.029684,-0.029684,0.004123,0.007366,0.005662,0.008879,0.003339,-0.008918,-0.013269,0.001415
campaign,0.0007260389,0.002236,-0.004241,-0.01100933,0.004934,-0.031788,0.014436,-0.071795,1.0,0.053326,...,-0.078979,0.078979,0.001848,-0.002889,-0.003066,0.002306,-0.002793,0.002838,-0.002777,0.002989
pdays,-0.0377833,0.029046,0.001873,-0.01061205,0.000671,-0.083171,0.003347,-0.047941,0.053326,1.0,...,-0.119879,0.119879,0.006162,0.024775,0.03455,0.008479,-0.003341,-0.004831,-0.03799,-0.021655


In [45]:
# Count the rows per label value to check class balance.
y.value_counts()

0    29258
1     3692
Name: y, dtype: int64

In [42]:
# Display the cleaned feature frame.
# NOTE(review): largely duplicates the x.head() preview above; consider removing.
x

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_cellular,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown
0,57,1,0,0,1,5,1,371,1,999,...,1,0,0,0,0,1,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,0,1,0,0,0,0,0,0,0,1
2,33,1,0,0,0,5,5,52,1,999,...,1,0,0,0,1,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,0,1,0,0,0,1,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32945,56,1,0,0,1,7,1,116,1,999,...,1,0,1,0,0,0,0,0,0,0
32946,37,1,0,0,1,7,5,69,7,999,...,1,0,0,0,0,0,0,0,1,0
32947,26,0,0,0,0,5,2,135,4,999,...,1,0,0,0,0,0,0,0,1,0
32948,31,0,0,0,0,4,1,386,1,999,...,1,0,0,0,1,0,0,0,0,0


### AutoML

In [47]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_settings = {
    "experiment_timeout_minutes": 30,
    "task": "classification",
    "primary_metric": "accuracy",
    # The TabularDataset `ds` is passed as-is; the label lives in column 'y'.
    "training_data": ds,
    "label_column_name": 'y',
    "n_cross_validations": 5,
}
automl_config = AutoMLConfig(**automl_settings)

In [48]:
# Submit your automl run

### YOUR CODE HERE ###
# show_output=True streams the run's progress into the cell.
# NOTE(review): the captured output reports "running on local", i.e. this run
# does not use the compute cluster created earlier — confirm that is intended.
auto_run = exp.submit(config=automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running in the active local environment.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_811a3b43-3b3a-42fa-968f-9f149bdfc3e7,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one cl

In [None]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
# get_output() is unpacked into the best run and its model.
# Note: this rebinds `model`, which earlier held the result of
# best.register_model(...).
best_run, model = auto_run.get_output()

#### Best AutoML model

In [None]:
# Display the best AutoML run.
best_run

In [None]:
# Show the metrics logged by the best AutoML run.
best_run.get_metrics()

In [None]:
# Persist the best AutoML model to a local file with joblib.
joblib.dump(model, filename = 'best_automl_model.sav')

In [None]:
# Display the model returned by auto_run.get_output().
model

#### Cluster cleanup

In [None]:
# Delete the compute cluster now that the experiments are finished.
compute_target.delete()