# Udacity Project 1

### Sandeep Pawar

In [39]:
import pandas as pd

from azureml.core import Workspace, Experiment

# from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling, BayesianParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.widgets import RunDetails
from azureml.train.automl import AutoMLConfig
import os

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [40]:
from azureml.core import Experiment, Workspace

# Load the workspace from the local configuration and open the experiment
# that the HyperDrive sweep is submitted to.
ws = Workspace.from_config()
exp = Experiment(ws, "udacity-project")

# Print the workspace resource id to confirm which workspace was loaded.
workspace_details = ws.get_details()
print(workspace_details['id'])

/subscriptions/b968fb36-f06a-4c76-a15f-afab68ae7667/resourceGroups/aml-quickstarts-135432/providers/Microsoft.MachineLearningServices/workspaces/quick-starts-ws-135432


In [41]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the AmlCompute cluster used by both the HyperDrive and the AutoML runs.
compute_name = "DS2V2"

try:
    # Reuse the cluster when it is already present in the workspace.
    vm = ComputeTarget(ws, compute_name)
    print(f"{compute_name} exists already")
except ComputeTargetException:
    # Provision a new cluster only when the lookup itself failed. Other errors
    # (authentication, networking, KeyboardInterrupt) must surface instead of
    # silently triggering a cluster creation.
    compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_V2", max_nodes=4)
    vm = ComputeTarget.create(ws, compute_name, compute_config)

# Block until the cluster reaches a terminal provisioning state.
vm.wait_for_completion(show_output=True)

DS2V2 exists already

Running


In [42]:
import shutil

# Specify parameter sampler.
# Each key is forwarded to train.py as a command-line argument.
# NOTE(review): if train.py passes --reg straight through as the scikit-learn
# penalty, the "lbfgs" and "sag" solvers cannot be combined with "l1" and those
# sampled runs would fail -- confirm against train.py.
param_space = {
    "--C": choice(100, 10, 1.0, 0.1, 0.01),
    "--solver": choice("lbfgs", "liblinear", "saga", "sag"),
    "--reg": choice("l1", "l2"),
}

sampling = RandomParameterSampling(param_space)

# Specifying Bandit Policy (early termination of individual runs).
# The policy is applied every time a run reports the primary metric
# (evaluation_interval=1), but only after the first 20 intervals have passed
# (delay_evaluation=20 counts reporting intervals, not runs).
# With slack_factor=0.1 a run is cancelled when its metric drops below
# best / (1 + 0.1), i.e. roughly 91% of the best performing run.
# Only that run is terminated; the sweep itself keeps going.
policy = BanditPolicy(evaluation_interval=1, slack_factor=0.1, delay_evaluation=20)

# Stage the training script in its own folder so only that folder is uploaded
# as the estimator's source directory. exist_ok makes the cell safe to re-run.
os.makedirs("./training", exist_ok=True)
shutil.copy('train.py', './training')

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory='./training',
              compute_target=vm,
              entry_script='train.py')

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator=est,
                                     policy=policy,
                                     primary_metric_name="AUC",
                                     hyperparameter_sampling=sampling,
                                     max_total_runs=200,
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE)



In [43]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hpo_run = exp.submit(hyperdrive_config)
RunDetails(hpo_run).show()

# Block until the sweep has finished. Without this, running the notebook top to
# bottom would query the best run while child runs are still in progress.
hpo_run.wait_for_completion(show_output=False)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [45]:
import joblib
# Get the best run of the sweep and report its metrics.
# NOTE(review): joblib is imported but nothing is saved in this cell; the model
# from the best run is not downloaded or persisted here.


best_run = hpo_run.get_best_run_by_primary_metric()
if best_run is None:
    # No child run has logged the primary metric (e.g. the sweep is unfinished
    # or every child failed); fail with an explicit message.
    raise RuntimeError("HyperDrive returned no best run: no child run has logged the 'AUC' metric.")
best_run_metrics = best_run.get_metrics()

print('Best Run Id: ', best_run.id)
print('\n AUC:', best_run_metrics['AUC']*100)
# The 'Regularization:' metric holds the penalty type (e.g. l1), not the strength C.
print('\n Regularization Penalty:',best_run_metrics['Regularization:'])
print('\n Solvers:',best_run_metrics['Solver:'])

Best Run Id:  HD_672c0a8e-e395-4bc0-95de-dd7f9ff3ef9b_5

 AUC: 77.3585596050799

 Regularization Strength: l1

 Solvers: liblinear


## AutoML

In [46]:
from azureml.core import Dataset

# Look up the registered 'Bank-marketing' dataset in the workspace by name
# (no explicit version is requested).
dataset = Dataset.get_by_name(workspace=ws, name='Bank-marketing')

In [48]:
from train import clean_data

# Run the project's cleaning routine, which hands back features and label separately.
x_df, y_df = clean_data(dataset)

# Attach the label as column "y" so features and target travel in a single frame.
x_df = x_df.assign(y=y_df)

# Preview the first three rows.
x_df.head(n=3)

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,campaign,pdays,previous,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
0,57,1,0,0,1,5,1,1,999,1,...,0,0,0,0,1,0,0,0,0,0
1,55,1,0,1,0,5,4,2,999,0,...,1,0,0,0,0,0,0,0,1,0
2,33,1,0,0,0,5,5,1,999,1,...,0,0,0,1,0,0,0,0,0,0


In [50]:
# Make sure the 'duration' column is not present and the features are one-hot encoded.
# The assertion enforces the first condition on every re-run; the encoding is
# checked visually from the column listing below.
assert "duration" not in x_df.columns, "'duration' column should not be in the training data"

x_df.columns

Index(['age', 'marital', 'default', 'housing', 'loan', 'month', 'day_of_week',
       'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'contact_cellular', 'contact_telephone', 'education_basic.4y',
       'education_basic.6y', 'education_basic.9y', 'education_high.school',
       'education_illiterate', 'education_professional.course',
       'education_university.degree', 'education_unknown', 'y'],
      dtype='object')

In [51]:
# Upload the combined frame under the 'aml' path of the workspace's default
# datastore and register it as the tabular dataset 'amldata'.
default_ds = ws.get_default_datastore()
upload_target = (default_ds, 'aml')
amlds = TabularDatasetFactory.register_pandas_dataframe(
    x_df,
    target=upload_target,
    name='amldata',
    show_progress=True,
)



Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to aml/3eca8db6-ed67-49a8-9ecd-f3b7779d0ec7/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [52]:
from azureml.train.automl import AutoMLConfig

# An Azure dataset (rather than a local frame) is needed when working with remote computes.
automl_settings = {
    'task': 'classification',
    'primary_metric': 'AUC_weighted',
    'experiment_timeout_minutes': 20,
    'n_cross_validations': 5,
    'compute_target': vm,
    'training_data': amlds,
    'label_column_name': 'y',
}

automl_config = AutoMLConfig(**automl_settings)

In [53]:
# Submit the AutoML experiment, show its monitoring widget, and block until it ends.

print('Starting AutoML...')
automl_experiment = Experiment(workspace=ws, name='automl')
automl_run = automl_experiment.submit(automl_config)

automl_widget = RunDetails(automl_run)
automl_widget.show()

automl_run.wait_for_completion(show_output=True)

Starting AutoML...
Running on remote.


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|3691                             |1    

{'runId': 'AutoML_77603343-0121-4970-b101-32179bbd87f6',
 'target': 'DS2V2',
 'status': 'Completed',
 'startTimeUtc': '2021-01-20T02:33:00.482614Z',
 'endTimeUtc': '2021-01-20T03:01:39.433881Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'DS2V2',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"a06ed3cc-becb-4506-9532-c71c8579d6f8\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"aml/3eca8db6-ed67-49a8-9ecd-f3b7779d0ec7/\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-135432\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"b968fb36-f06a-4c76-a15f-afab68ae7667\\\\\\

In [56]:
# Fetch the best AutoML child run together with its fitted pipeline.
best_run, fitted_model = automl_run.get_output()

print(best_run)
print(fitted_model)

# List every metric logged by the best run, one per line.
best_run_metrics = best_run.get_metrics()
for metric_name, metric in best_run_metrics.items():
    print(metric_name, metric)

Run(Experiment: automl,
Id: AutoML_77603343-0121-4970-b101-32179bbd87f6_19,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                    min_impurity_split=None,
                                                                                                    min_samples_leaf=0.01,
                        

In [57]:
# Register the best AutoML model, then list the models registered in the workspace.

from azureml.core import Model

# Tag the model with how it was trained and record its weighted AUC as a property.
model_tags = {'Training Config':'Auto ML'}
model_properties = {'AUC': best_run_metrics['AUC_weighted']}
best_run.register_model(
    model_path='outputs/model.pkl',
    model_name='model_automl',
    tags=model_tags,
    properties=model_properties,
)

# Print each registered model with its tags and properties.
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print ('\t',prop_name, ':', prop)
    print('\n')

model_automl version: 1
	 Training Config : Auto ML
	 AUC : 0.7987412103693771




In [None]:
# Delete the training cluster

In [58]:
# Delete the compute target held in `vm`; this is the last step of the notebook.
vm.delete()