# Jupyter notebook for Optimizing an Azure ML Pipeline 


## Part 1: Hyperparameter Tuning for a chosen model [Logistic Regression]
### Initiate a _workspace_ and start an _experiment_

In [1]:
from azureml.core import Workspace, Experiment

# Load the workspace from its saved configuration.
ws = Workspace.from_config()

# Summarise the workspace identity, one field per output line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep = '\n')

# Every run in this notebook is submitted under this experiment.
exp_name = "UCnavAMLproject01"
exp = Experiment(name=exp_name, workspace=ws)

# Start an interactive run on the experiment; the handle is kept in `run`.
run = exp.start_logging()

Performing interactive authentication. Please follow the instructions on the terminal.
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-258212
Azure region: westeurope
Subscription id: 976ee174-3882-4721-b90a-b5fef6b72f24
Resource group: aml-quickstarts-258212


To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code L7J5Y82RR to authenticate.


### Create and configure a compute cluster

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
# ComputeTargetException must be imported explicitly; without it the `except`
# clause below raises NameError exactly when the cluster does not exist yet.
from azureml.core.compute_target import ComputeTargetException

cluster_name = "UCnavProject01"

# Create a compute cluster
# Use vm_size = "Standard_DS3_v2" in provisioning_configuration.
# max_nodes should be no greater than 4.

try:
    # Reuse the cluster when one with this name already exists in the workspace.
    compute_cluster = ComputeTarget(workspace = ws, name = cluster_name)
    print('Existing compute cluster found! \n')
except ComputeTargetException:
    # No such cluster yet: provision a new one that can scale between 0 and 4 nodes.
    print('Creating a new compute cluster... \n')
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS3_v2', min_nodes=0, max_nodes=4)
    compute_cluster = ComputeTarget.create(workspace = ws, name = cluster_name, provisioning_configuration=compute_config)

# Block until provisioning has finished, then display the status and details
# of the compute resources.
compute_cluster.wait_for_completion(show_output = True)
print('\n Compute Cluster details: \n', compute_cluster.get_status().serialize())

# List every compute target in the workspace with its type and provisioning state.
compute_resources = ws.compute_targets
for resource_name, resource in compute_resources.items():
    print('\n', resource_name, resource.type, resource.provisioning_state)

Existing cluster found! 


Running
Compute Cluster details: 
 {'errors': [], 'creationTime': '2024-04-23T12:17:20.868209+00:00', 'createdBy': {'userObjectId': '7ed85bb2-82b4-4698-b957-ed49ef858ced', 'userTenantId': '660b3398-b80e-49d2-bc5b-ac1dc93b5254', 'userName': 'ODL_User 258212'}, 'modifiedTime': '2024-04-23T12:18:52.122137+00:00', 'state': 'Running', 'vmSize': 'Standard_DS3_v2'}


### Imports

In [None]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.sampling import BayesianParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os



### Parameter Sampler specification

In [None]:
from math import log

from azureml.train.hyperdrive.parameter_expressions import uniform, loguniform, choice, quniform

# Specify parameter sampler

# RandomParameterSampling: the search space used by the HyperDrive run.
ps = RandomParameterSampling(
    {
        "learning_rate": uniform(0.01, 1),
        # loguniform(a, b) draws exp(uniform(a, b)), so its bounds are given in
        # log space. Passing log(0.01) and log(10) makes C span [0.01, 10];
        # the raw bounds (0.01, 10) would have sampled C from ~1.01 up to ~22026.
        "C": loguniform(log(0.01), log(10)),
        "batch_size": choice(16, 32, 64, 128, 256),
        "hidden_size": choice(80, 120, 160, 240, 480),
        # NOTE(review): quniform yields floats (e.g. 300.0); confirm that train.py
        # parses --max_iter in a way that accepts them.
        "max_iter": quniform(100, 1000, 100),
        "solver": choice("lbfgs", "newton-cg", "sag", "saga")
    }
)

# BayesianParameterSampling: alternative search space (not used by `ps` above).
# NOTE(review): these keys carry a '--' prefix and mix '-' with '_'
# ('--learning-rate' vs '--batch_size'), unlike the keys of `ps`; confirm
# against the argument names train.py actually declares.
ps_B = BayesianParameterSampling(
    {'--learning-rate': uniform(0.01,1),
     '--batch_size': choice(16, 32, 64, 128),
     '--max_iter': choice(100, 150, 200, 250, 300)
    }
)

### Policy for early stopping

In [None]:
# Early-termination policies, one per parameter sampler.

# Random sampling: Bandit policy with a 10% slack factor, checked at every
# evaluation interval once the first 5 intervals have passed.
policy_R = BanditPolicy(
    evaluation_interval=1,
    delay_evaluation=5,
    slack_factor=0.1,
)

# Bayesian sampling does not support early termination policies,
# so its policy is deliberately left unset.
policy_B = None


### Set up the environment for the training run

In [None]:
# Build the environment for the training run from the conda specification
# file (path is relative to the current working directory).
sklearn_env = Environment.from_conda_specification(
    file_path='conda_dependencies.yml',
    name='sklearn-env',
)


### Import and register the dataset

In [None]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core.dataset import Dataset
from azureml.exceptions import UserErrorException

# Create TabularDataset using TabularDatasetFactory
## Data is available at:
## "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

datapath_url = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
ds_name = 'UCbankmarketing'

try:
    # Reuse the dataset already registered under this name instead of
    # looking it up and throwing the result away.
    dataset = Dataset.get_by_name(workspace = ws, name = ds_name)
    dataset_registered = True

except UserErrorException:
    # Not registered yet: build it from the public CSV and register it.
    dataset_registered = False
    print("The Bank marketing dataset is not registered in workspace yet.")
    print('Registering the dataset')
    # from_delimited_files has no `random_state` parameter; passing one raises TypeError.
    dataset = TabularDatasetFactory.from_delimited_files(path = datapath_url)
    dataset = dataset.register(workspace = ws,
                               name = ds_name,
                               description = 'Udacity Bank marketing dataset',
                               create_new_version = True)

### Create an `estimator` for the _training_ script

In [None]:
## Create a Logistic Regression estimator

# from azureml.train.sklearn import LogisticRegression

# estimator_LR = LogisticRegression(
#     source_directory = ".",              # Specify the training script directory
#     compute_target = compute_cluster,    # Specify the compute target
#     entry_script = "train.py",           # Specify the training script
#     max_iter = 1000,                     # Set a large value for max iterations
#     vm_size = "Standard_DS3_v2", 
#     vm_priority = "dedicated"
# )

# NOTE: 
## WARNING:azureml.train.sklearn:'SKLearn','LogisticRegression' estimators are deprecated. 
## Use 'ScriptRunConfig' from 'azureml.core.script_run_config' with own defined environment 
## or the AzureML-Tutorial curated environment.
### script_run_config.script_run_config.target = cluster

In [None]:
from azureml.core import ScriptRunConfig

# Create a ScriptRunConfig Object to specify the configuration details of the training job

# A TabularDataset has no as_mount() (mounting applies to file datasets), so the
# dataset is handed to the script as a named input instead.
# NOTE(review): train.py must accept `--dataset` and is presumably expected to read
# the data via Run.get_context().input_datasets['bankmarketing'] - confirm.
args = ['--dataset', dataset.as_named_input('bankmarketing')]

# ScriptRunConfig names the entry point `script` (not `entry_script`) and does not
# accept `max_iter`, `vm_size` or `vm_priority`; those keywords raised TypeError.
# max_iter is supplied per run by the hyperparameter sampler, and the VM size is a
# property of the compute cluster itself.
estimator_src = ScriptRunConfig(
    source_directory = '.',               # Specify the training script directory
    script = "train.py",                  # Specify the training script
    arguments = args,                     # Additional/Optional command line arguments
    compute_target = compute_cluster,     # Specify the compute target
    environment = sklearn_env             # Use the defined environment
)

### Create a `HyperDriveConfig`

In [None]:
from azureml.train.hyperdrive.runconfig import HyperDriveConfig

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    # A ScriptRunConfig is passed through `run_config`; `estimator` is meant for
    # the deprecated Estimator classes.
    run_config = estimator_src,
    hyperparameter_sampling = ps,
    policy = policy_R,                                # early stopping policy for random sampling
    # Must match, character for character, the metric name logged by train.py.
    # The best-run cell reads best_run_metrics['Accuracy'], so the same
    # capitalised name is used here.
    # NOTE(review): confirm against the run.log(...) call in train.py.
    primary_metric_name = "Accuracy",
    primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
    max_total_runs = 20,
    max_concurrent_runs = 4                           # equals the cluster's max_nodes
)

### Submit the `HyperDriveConfig` to run the _experiment_

In [None]:
# Send the HyperDrive configuration to the experiment.
hyperdrive_run = exp.submit(config = hyperdrive_config, show_output = True)

# Show the run-details widget for the submitted run.
run_widget = RunDetails(hyperdrive_run)
run_widget.show()

# Wait for the whole HyperDrive run to finish, streaming its output.
hyperdrive_run.wait_for_completion(show_output = True)
 

### Select the best hyper-parameters for the model and save the best model 

In [None]:
import os

import joblib
import sklearn   # needed for sklearn.__version__ below; it was never imported before
from azureml.core.model import Model

# Get the best run and save the model from that run.

best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    # Fail with a clear message instead of an AttributeError on None further down.
    raise RuntimeError("HyperDrive returned no best run; check that the child runs "
                       "logged the primary metric.")

# Fetch details and metrics once and reuse them below.
best_run_details = best_run.get_details()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run_details['runDefinition']['arguments']

print('Best Run', best_run)
print("Details :", best_run_details)   # previously referenced the undefined name `hyperdrive_best_run`
print('\n')
print("Best Run file names: ", best_run.get_file_names())
print('\n')
print("Best Run metrics: ", best_run_metrics)
print('\n')
print('Best Run Id: ', best_run.id)
print('\n Accuracy: ', best_run_metrics['Accuracy'])
# `parameter_values` is the full argument list of the run, not only a learning rate.
print('\n Run arguments (hyperparameters): ', parameter_values)

# Save the best model.
best_model = best_run.register_model( model_name = 'hyperdrive_best_model',
                                      # Path of the model file inside the run's stored outputs,
                                      # written without a leading './'.
                                      # NOTE(review): confirm train.py saves to outputs/model.pkl.
                                      model_path = 'outputs/model.pkl',
                                      model_framework = Model.Framework.SCIKITLEARN,
                                      # NOTE(review): this is the notebook kernel's scikit-learn
                                      # version; confirm it matches the training environment.
                                      model_framework_version = sklearn.__version__,
                                      tags = {"Method" : "HyperDrive"},
                                      # Stored as text so the property value is a plain string.
                                      properties = {"Accuracy" : str(best_run_metrics["Accuracy"])}
                                    )

# joblib.dump does not create missing directories, so make sure ./outputs exists.
os.makedirs('./outputs', exist_ok = True)
joblib.dump(parameter_values, filename = './outputs/best_model_parameters.joblib')

print("The best model has been saved successfully.")


## Part 2: AutoML modelling

### Create TabularDataset

In [None]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

datapath_url = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
# from_delimited_files has no `random_state` parameter; passing one raises TypeError.
ds = TabularDatasetFactory.from_delimited_files(path = datapath_url)
#ds_name = 'UCbankmarketing'

### Clean the data

In [None]:
# Import pandas unconditionally: the previous guard `pandas in sys.modules`
# referenced two undefined names (`pandas`, `sys`) and raised NameError.
# Re-importing an already loaded module is cheap.
import pandas as pd

from train import clean_data

# Use the clean_data function to clean the dataset.
# It returns the feature table and the label column, which are unpacked here.
x, y = clean_data(ds)

### Split the data into _training_ and _testing_ sets

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing, stratified on the label, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.25,
    random_state = 42,
    stratify = y,
)

# Put the training features and labels side by side in one table.
training_parts = [x_train, y_train]
ds_train = pd.concat(training_parts, axis="columns")

### Configure the AutoML run settings

In [None]:
from azureml.train.automl import AutoMLConfig
from azureml.data.dataset_factory import TabularDatasetFactory

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE 'experiment_timeout_minutes' PARAMETER OR THE INSTANCE WILL TIME OUT.

label_column = 'Subscribe'

# Fail early with a readable message if the cleaned training frame does not
# contain the label column AutoML is told to predict.
if label_column not in ds_train.columns:
    raise ValueError("Label column '{}' not found in ds_train; available columns: {}"
                     .format(label_column, list(ds_train.columns)))

# AutoML on a remote compute target only accepts a TabularDataset, so the pandas
# training frame is uploaded to the default datastore and registered first.
automl_training_data = TabularDatasetFactory.register_pandas_dataframe(
    dataframe = ds_train,
    target = ws.get_default_datastore(),
    name = 'UCbankmarketing_train',
    description = 'Cleaned training split of the bank marketing data'
)

automl_config = AutoMLConfig(
    experiment_timeout_minutes = 30,  # DO NOT CHANGE THE 'experiment_timeout_minutes' PARAMETER
    enable_early_stopping = True,
    debug_log = 'automl_errors.log',
    task = 'classification',
    primary_metric = 'accuracy',
    training_data = automl_training_data,
    label_column_name = label_column,
    n_cross_validations = 6,
    compute_target = compute_cluster,
    enable_onnx_compatible_models = True
    )

### Submit the AutoML run

In [2]:
# Send the AutoML configuration to the experiment.
from azureml.widgets import RunDetails

automl_run = exp.submit(config = automl_config, show_output = True)

# Open the widget that shows the progress and results of the run.
automl_widget = RunDetails(automl_run)
automl_widget.show()

# Wait until the AutoML run has finished, streaming its output.
automl_run.wait_for_completion(show_output = True)

### Save the best AutoML model

In [None]:
# Retrieve and save the best AutoML model.
import os

import joblib   # imported here so this cell does not rely on an earlier cell

print(automl_run.get_portal_url())

# Retrieving the best model: get_output() yields the best child run and its fitted model.
best_run_AutoML, best_model_AutoML = automl_run.get_output()
best_run_metrics_AutoML = best_run_AutoML.get_metrics()

print("Best AutoML Run Id: ", best_run_AutoML.id)
print("Accuracy: ", best_run_metrics_AutoML['accuracy'])
print("Fitted model:", best_model_AutoML)
print("Estimator:", best_model_AutoML._final_estimator)

# These results were previously computed and silently dropped, because a bare
# expression is only displayed when it is the last statement of a cell.
print("Other details: \n")
print(best_run_AutoML.get_details())
print(best_run_AutoML.get_tags())

# Saving the best AutoML model
# joblib.dump does not create missing directories, so make sure outputs/ exists.
os.makedirs('outputs', exist_ok = True)
joblib.dump(best_model_AutoML, filename = 'outputs/best_model_AutoML.joblib')

# Register the model in the workspace under the name recorded on the best run.
model_AutoML_saved = automl_run.register_model(model_name = best_run_AutoML.properties['model_name'], description = 'Best AutoML model')
print("The best AutoML model has been saved successfully!")


### Clean up the compute cluster

In [None]:
# cluster cleanup

# Delete the compute cluster used by the runs above now that the experiments
# are finished. Later cells that reference `compute_cluster` will no longer
# have a usable target after this call.
compute_cluster.delete()