In [1]:
from azureml.core import Workspace, Experiment, ScriptRunConfig

# ws = Workspace.get(name="udacity-project")
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-123951
Azure region: southcentralus
Subscription id: 26806ae2-7725-4970-9c73-e6b2c7c706c1
Resource group: aml-quickstarts-123951


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Creating compute cluster

# naming CPU cluster
mycompute_cluster_name = "sayed-cluster"

# Verifying if the cluster exists
try:
    my_compute = ComputeTarget(workspace=ws, name=mycompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',max_nodes=4)
    my_compute = ComputeTarget.create(ws, mycompute_cluster_name, compute_config)

my_compute.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import normal, uniform, choice
import os

script_folder = './'
script='train.py'


# Specifying parameter sampler
ps = RandomParameterSampling(
     {
        '--C': choice(0.01,5,20,100,500), 
        '--max_iter': choice(10,50,100,150,200)
     }
)

# Specifying a Policy
policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=2)

#accuracy set as primary metric, focuses on maximizing "accuracy"
primary_metric_name="Accuracy"
primary_metric_goal=PrimaryMetricGoal.MAXIMIZE

max_total_runs=20
max_concurrent_runs=4

if "training" not in os.listdir():
    os.mkdir("./training")

# Creating a SKLearn estimator for use with train.py
est = SKLearn(script_folder,
        compute_target=my_compute, 
        entry_script=script)


# Creating a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator = est,
                             hyperparameter_sampling=ps,
                             policy=policy,
                             primary_metric_name=primary_metric_name,
                             primary_metric_goal=primary_metric_goal,
                             max_total_runs=max_total_runs,
                             max_concurrent_runs=max_concurrent_runs)

In [4]:
# Submiting your hyperdrive run to the experiment 

hyperdrive_run = exp.submit(config = hyperdrive_config, show_output=True)

#Showing run details with the widget.
RunDetails(hyperdrive_run).show()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [5]:
hyperdrive_run.get_status()

'Completed'

In [6]:
import joblib
# Getting best run 

best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
best_run_model_names = best_run.get_file_names()

print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['Accuracy'])
print('\n best_run_model_names:',best_run_model_names)
print('\n best_run:',best_run.get_details())




Best Run Id:  HD_1dab3f3a-1931-4002-ad00-d38498400230_8

 Accuracy: 0.9132018209408195

 best_run_model_names: ['azureml-logs/55_azureml-execution-tvmps_c1160e87dc13acf2e78e2440e844f01e66510c7202fe261c20cd78ebfedfe16e_d.txt', 'azureml-logs/65_job_prep-tvmps_c1160e87dc13acf2e78e2440e844f01e66510c7202fe261c20cd78ebfedfe16e_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_c1160e87dc13acf2e78e2440e844f01e66510c7202fe261c20cd78ebfedfe16e_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/111_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/mymodel.joblib']

 best_run: {'runId': 'HD_1dab3f3a-1931-4002-ad00-d38498400230_8', 'target': 'sayed-cluster', 'status': 'Completed', 'startTimeUtc': '2020-10-31T19:53:45.347654Z', 'endTimeUtc': '2020-10-31T19:54:33.062608Z', 'properties': {'_azureml.ComputeTargetType': 'amlcompute', 'ContentSnapshotId': '81883a5d-f4dd-45b6-8e9f-3d3c4dcd8a8

In [7]:
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,HD_1dab3f3a-1931-4002-ad00-d38498400230_8,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [8]:
# saving the best model
model = best_run.register_model(model_name='best_model_sayed', 
                           model_path='outputs/mymodel.joblib')

In [9]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Creating TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"


paths_url = [
    'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
]
dataset = TabularDatasetFactory.from_delimited_files(path=paths_url)

In [10]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(dataset)

In [11]:
import pandas as pd

cleanedData = pd.concat([x,y], axis =1)
cleanedData

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
0,57,1,0,0,1,5,1,371,1,999,...,0,0,0,0,1,0,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,1,0,0,0,0,0,0,0,1,0
2,33,1,0,0,0,5,5,52,1,999,...,0,0,0,1,0,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,1,0,0,0,1,0,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32945,56,1,0,0,1,7,1,116,1,999,...,0,1,0,0,0,0,0,0,0,0
32946,37,1,0,0,1,7,5,69,7,999,...,0,0,0,0,0,0,0,1,0,0
32947,26,0,0,0,0,5,2,135,4,999,...,0,0,0,0,0,0,0,1,0,0
32948,31,0,0,0,0,4,1,386,1,999,...,0,0,0,1,0,0,0,0,0,0


In [12]:
from sklearn.model_selection import train_test_split

data_train, data_test = train_test_split(cleanedData, test_size=0.2, random_state=111)
data_train

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
17428,35,0,0,0,0,8,2,255,1,999,...,0,0,0,0,0,0,1,0,0,0
8403,30,0,0,0,0,6,3,587,1,999,...,1,0,0,0,0,0,0,1,0,1
6111,38,1,0,0,0,5,3,479,2,999,...,1,0,0,0,0,0,1,0,0,0
9960,36,1,0,1,0,5,1,133,1,999,...,1,0,0,0,1,0,0,0,0,0
2571,29,1,0,1,1,8,3,94,2,999,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7490,60,0,0,1,0,6,4,45,2,999,...,1,1,0,0,0,0,0,0,0,0
25257,32,0,0,1,0,8,2,152,3,999,...,0,0,0,0,0,0,0,1,0,0
4820,41,0,0,0,0,5,1,18,4,999,...,0,0,0,0,1,0,0,0,0,0
10196,57,1,0,1,0,8,5,160,1,999,...,0,0,0,0,0,0,1,0,0,0


In [25]:
from azureml.core import Dataset

os.makedirs('data', exist_ok=True)
training_data_path = 'data/trainingdata.csv'
data_train.to_csv(training_data_path)


datastore = ws.get_default_datastore()

datastore.upload(src_dir='data', target_path='data')

my_ds = Dataset.Tabular.from_delimited_files(datastore.path('data/trainingdata.csv'))

Uploading an estimated of 1 files
Uploading data/trainingdata.csv
Uploaded data/trainingdata.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [30]:
from azureml.train.automl import AutoMLConfig

# Setting parameters for AutoMLConfig
automl_settings = {
    "n_cross_validations": 5,
    "primary_metric": 'accuracy',
    "enable_early_stopping": True,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1
}
automl_config = AutoMLConfig(task = 'classification',
                            experiment_timeout_minutes=30,
                            compute_target = my_compute,
                            training_data = my_ds,
                            label_column_name = 'y',
                            **automl_settings
                            )

In [31]:
# Submiting automl run
automl_run = exp.submit(config = automl_config, show_output=True)
RunDetails(automl_run).show()

Running on remote.
Running on remote compute: sayed-cluster
Parent Run ID: AutoML_00ec084e-6a44-4c7f-b0fb-39776398ea9b

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class   

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [33]:
# Retrieving automl model.

best_run, fitted_model = automl_run.get_output()
best_run


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_00ec084e-6a44-4c7f-b0fb-39776398ea9b_36,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [36]:
best_run_metrics = best_run.get_metrics()
best_run_metrics

{'AUC_weighted': 0.9448663441765829,
 'weighted_accuracy': 0.9493859080796201,
 'f1_score_micro': 0.9158573596358117,
 'precision_score_macro': 0.7899137212354728,
 'recall_score_macro': 0.7807917001129325,
 'matthews_correlation': 0.5705735221388888,
 'average_precision_score_macro': 0.8194777525215464,
 'f1_score_weighted': 0.9151865958456433,
 'precision_score_weighted': 0.914612696668675,
 'accuracy': 0.9158573596358117,
 'precision_score_micro': 0.9158573596358117,
 'average_precision_score_micro': 0.9800136909598205,
 'AUC_macro': 0.9448663441765828,
 'recall_score_micro': 0.9158573596358117,
 'log_loss': 0.2339347467840705,
 'AUC_micro': 0.9792397376353099,
 'f1_score_macro': 0.7851640811495533,
 'balanced_accuracy': 0.7807917001129325,
 'average_precision_score_weighted': 0.9539036117547923,
 'norm_macro_recall': 0.5615834002258651,
 'recall_score_weighted': 0.9158573596358117,
 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_00ec084e-6a44-4c7f-b0fb-39776398ea9b_3

In [34]:
fitted_model

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                    min_samples_split=0.15052631578947367,
                                                                                                    min_weight_fraction_leaf=0.0,
                                                                                                    n_estimators=10,
      

In [46]:
# saving AutoML model
best_run.register_model(model_path='./outputs/', model_name='best_automl_model.pkl',
                        tags={'Training context':'Auto ML'},
                        properties={'AUC': best_run_metrics['AUC_weighted'], 'Accuracy': best_run_metrics['accuracy']})

Model(workspace=Workspace.create(name='quick-starts-ws-123951', subscription_id='26806ae2-7725-4970-9c73-e6b2c7c706c1', resource_group='aml-quickstarts-123951'), name=best_automl_model.pkl, id=best_automl_model.pkl:2, version=2, tags={'Training context': 'Auto ML'}, properties={'AUC': '0.9448663441765829', 'Accuracy': '0.9158573596358117'})

In [50]:
# my_compute.delete()
# cluster being deleted
my_compute

AmlCompute(workspace=Workspace.create(name='quick-starts-ws-123951', subscription_id='26806ae2-7725-4970-9c73-e6b2c7c706c1', resource_group='aml-quickstarts-123951'), name=sayed-cluster, id=/subscriptions/26806ae2-7725-4970-9c73-e6b2c7c706c1/resourceGroups/aml-quickstarts-123951/providers/Microsoft.MachineLearningServices/workspaces/quick-starts-ws-123951/computes/sayed-cluster, type=AmlCompute, provisioning_state=Deleting, location=southcentralus, tags=None)

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

