# Hyperparameter Tuning using HyperDrive

Set up all imports for the notebook.

In [1]:
# Imports for Azure ML Environment
import azureml.core
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Environment, ScriptRunConfig
from azureml.core.dataset import Dataset
from azureml.widgets import RunDetails

# Imports for model training and hyperparameter optimization
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling, GridParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform, randint

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

import pandas as pd

# Other imports
import os

# Report the installed Azure ML core SDK version
print(f"SDK version: {azureml.core.VERSION}")

SDK version: 1.50.0


## Initial Setup

* Workspace
* Experiment
* Compute Cluster

In [2]:
# Connect to the workspace described by the local config file and open the experiment
ws = Workspace.from_config()
experiment_name = 'ud3-hyper-tune'
experiment = Experiment(workspace=ws, name=experiment_name)

# Cluster definition. STANDARD_D2_V2 is chosen for cost management; a larger
# VM size would give more compute for running more concurrent experiments.
cluster_name = "mom-health-hcluster"
compute_config = AmlCompute.provisioning_configuration(
    vm_size="STANDARD_D2_V2",
    min_nodes=0,
    max_nodes=4,
)

# Reuse the cluster when it already exists; otherwise create it and wait
# until provisioning has finished.
try:
    my_compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    my_compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    my_compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')


Found existing compute target.


## Dataset

### Overview

In this exercise, I'll be using the Maternal Health Risk data set from the [UCI Machine Learning repository](https://archive.ics.uci.edu/ml/datasets/Maternal+Health+Risk+Data+Set). This data set contains 1,014 records with 7 attributes including:

* Age
* Systolic Blood Pressure as SystolicBP
* Diastolic BP as DiastolicBP
* Blood Sugar as BS
* Body Temperature as BodyTemp
* HeartRate
* RiskLevel
 
RiskLevel is one of 

* "low risk"
* "mid risk" 
* "high risk"

The ML model will be trained to predict the *RiskLevel* column based on the other parameters.

In [3]:
# Load the Maternal Health Risk data set straight from the UCI repository
# and drop any rows that contain missing values.
data_file = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00639/Maternal%20Health%20Risk%20Data%20Set.csv'

df = pd.read_csv(data_file).dropna()

# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
count,1014.0,1014.0,1014.0,1014.0,1014.0,1014.0
mean,29.871795,113.198225,76.460552,8.725986,98.665089,74.301775
std,13.474386,18.403913,13.885796,3.293532,1.371384,8.088702
min,10.0,70.0,49.0,6.0,98.0,7.0
25%,19.0,100.0,65.0,6.9,98.0,70.0
50%,26.0,120.0,80.0,7.5,98.0,76.0
75%,39.0,120.0,90.0,8.0,98.0,80.0
max,70.0,160.0,100.0,19.0,103.0,90.0


## Feature Engineering

Do some EDA and feature engineering to enhance the performance of the ML algorithm.

Register the data sets such that the training script can access them for training.

In [4]:
# Dict for mapping the categorical risk labels to numeric values
risks = {"low risk":1, "mid risk":2, "high risk":3}

# Only map while the column still holds the original string labels.
# Mapping an already-numeric column would turn every value into NaN,
# which is what happened when this cell was re-run without reloading df.
if not pd.api.types.is_numeric_dtype(df["RiskLevel"]):
    df["RiskLevel"] = df["RiskLevel"].map(risks)

print(df.head(5))

# Separate the features (x) from the target column (y)
x = df.copy()
y = x.pop("RiskLevel")

# Splitting the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Rescale the data to fit with logistic classification algorithms.
# The scaler is fitted on the training split ONLY and then applied to both
# splits, so the test data is transformed with the same mean/variance the
# model was trained on and no test-set statistics are used.
scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))

# Print the rescaled dataframe
x_train.describe()


   Age  SystolicBP  DiastolicBP    BS  BodyTemp  HeartRate  RiskLevel
0   25         130           80  15.0      98.0         86          3
1   35         140           90  13.0      98.0         70          3
2   29          90           70   8.0     100.0         80          3
3   30         140           85   7.0      98.0         70          3
4   35         120           60   6.1      98.0         76          1


Unnamed: 0,0,1,2,3,4,5
count,811.0,811.0,811.0,811.0,811.0,811.0
mean,-1.032193e-16,-2.491499e-16,2.030161e-16,-2.79267e-16,4.744526e-15,-1.045882e-16
std,1.000617,1.000617,1.000617,1.000617,1.000617,1.000617
min,-1.480608,-2.344268,-1.988439,-0.8345775,-0.5003496,-8.158575
25%,-0.8071931,-0.7219441,-0.8278605,-0.5636009,-0.5003496,-0.5310165
50%,-0.2086025,0.3596051,0.2601822,-0.3829498,-0.5003496,0.1954177
75%,0.5770476,0.3596051,0.985544,-0.2324073,-0.5003496,0.6797071
max,3.008822,2.522703,1.710906,3.079529,3.088115,1.890431


In [5]:
# Register each split as a tabular dataset in the workspace's default blob store
blob_store = ws.datastores['workspaceblobstore']
splits = {
    'x_train': x_train,
    'x_test': x_test,
    'y_train': pd.DataFrame(y_train),
    'y_test': pd.DataFrame(y_test),
}

for split_name, frame in splits.items():
    registered = Dataset.Tabular.register_pandas_dataframe(frame, target=blob_store, name=split_name)

# Show the dataset registered last (y_test) as the cell output
registered


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/68805e54-9ca5-4815-9162-a3a7a7023250/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/45f62b3b-d241-4b29-936a-54779614e848/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/98d07863-e8a2-48d4-9643-9ad3dbb2c7ab/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploadin

{
  "source": [
    "('workspaceblobstore', 'managed-dataset/9d76ebd8-981d-43d7-ad2f-6634499bde3f/')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ],
  "registration": {
    "id": "04a4df13-660c-4ca4-89aa-26f1c89cbc34",
    "name": "y_test",
    "version": 9,
    "workspace": "Workspace.create(name='ud3', subscription_id='35976fdd-d799-46b2-9fdd-859ceba2a824', resource_group='aml-rg')"
  }
}

## Hyperdrive Configuration

For this project, I selected two different classification algorithms - SGDClassifier and LogisticRegression. I selected these for their ability to perform relatively well on general classification problems.

Furthermore, I experimented with different parameter sampling models, trying both Random and Grid sampling. Overall, the choice of sampling model did not significantly impact the algorithm performance, so I standardized on Random sampling.

For the SGDClassifier model I varied the following hyperparameters:

* alpha
* max_iter
* learning_rate
* eta0

Performance tended to be best with:

For the LogisticRegression model I varied the following hyperparameters:

* solver
* penalty
* multi_class
* max_iter
* C

Performance tended to be best with: 

### Results 

Neither SGDClassifier nor LogisticRegression showed significant performance differences over the other. Both produced results around 66% accuracy with all the same feature engineering done on the data.

In [6]:
# Flag to select which algorithm to use - can be one of 'LR' or 'SGD'
algo = 'SGD'

# Early termination policy for the sweep (not required with Bayesian sampling)
early_termination_policy = BanditPolicy(slack_factor=0.2, evaluation_interval=1)

# Pick the search space and the matching training script for the algorithm.
# NOTE(review): randint(10000) presumably samples from [0, 10000), so
# max_iter could be drawn as 0 - confirm the training scripts tolerate that.
if algo == 'SGD':
    param_sampling = RandomParameterSampling({
        'loss': choice(['log_loss']), # Can only use log for multi-class
        'penalty': choice(['l2']), # Need to have set to l2 for multi-class
        'alpha': uniform(0.0001, 0.0010),
        'max_iter': randint(10000),
        'learning_rate': choice(['constant', 'optimal', 'invscaling', 'adaptive']),
        'eta0': uniform(0.001, 0.01)
    })
    train_script = 'train.py'
elif algo == 'LR':
    param_sampling = RandomParameterSampling({
        'solver': choice(['lbfgs', 'sag', 'saga', 'newton-cg']),
        'penalty': choice(['l2', 'none']),
        'multi_class': choice(['ovr', 'multinomial']),
        'max_iter': randint(10000),
        'C': uniform(1, 100)
    })
    train_script = 'train_lr.py'
else:
    # Fail with a descriptive, specific error instead of a bare Exception
    raise ValueError(f"algorithm unknown for training: {algo!r} (expected 'SGD' or 'LR')")

# Make sure the ./training directory exists
os.makedirs("./training", exist_ok=True)

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='env.yaml')

# Create a ScriptRunConfig object to specify the configuration details of the training job
estimator = ScriptRunConfig(source_directory='.', script=train_script, environment=sklearn_env, compute_target=my_compute_target)

# Create a HyperDriveConfig from the run config, hyperparameter sampler, and policy.
# NOTE(review): the cluster is provisioned with max_nodes=4, so presumably fewer
# than 10 runs actually execute at the same time - confirm this is intended.
hyperdrive_run_config = HyperDriveConfig(run_config=estimator,
                                     hyperparameter_sampling=param_sampling,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=200,
                                     max_concurrent_runs=10,
                                     policy=early_termination_policy)

In [7]:
# Launch the hyperparameter sweep under the experiment and keep the run handle
hyperdrive_run = experiment.submit(hyperdrive_run_config)

## Run Details

Use the `RunDetails` widget to show the different experiments.

In [8]:
# Print the run summary, then show the interactive RunDetails widget
print(hyperdrive_run)
details_widget = RunDetails(hyperdrive_run)
details_widget.show()

# Block until the sweep has finished; the returned value is the cell output
hyperdrive_run.wait_for_completion()

Run(Experiment: ud3-hyper-tune,
Id: HD_e1a10dc3-a517-4768-8035-5b2bd411e82d,
Type: hyperdrive,
Status: Running)


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

{'runId': 'HD_e1a10dc3-a517-4768-8035-5b2bd411e82d',
 'target': 'mom-health-hcluster',
 'status': 'Completed',
 'startTimeUtc': '2023-05-01T21:06:06.536469Z',
 'endTimeUtc': '2023-05-01T21:39:47.553318Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"Accuracy","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '447dcc71-7abb-4f5f-8442-751a3ccc20f6',
  'user_agent': 'python/3.8.5 (Linux-5.15.0-1035-azure-x86_64-with-glibc2.10) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.50.0',
  'space_size': 'infinite_space_size',
  'score': '0.6798029556650246',
  'best_child_run_id': 'HD_e1a10dc3-a517-4768-8035-5b2bd411e82d_167',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_e1a10dc3-a517-4768-8035-5b2bd411e82d_167'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  't

## Best Model

Get the best model from the hyperdrive experiments and display all the properties of the model.

Register the model.

In [9]:
# Fetch the child run that scored best on the primary metric
best_run = hyperdrive_run.get_best_run_by_primary_metric()

# Fail with a clear message when no child run reported the primary metric,
# instead of an AttributeError on None further down.
if best_run is None:
    raise RuntimeError("No child run reported the primary metric; there is no best run to inspect.")

# Show the full run details and the files the run produced
print(best_run.get_details())
print(best_run.get_file_names())

{'runId': 'HD_e1a10dc3-a517-4768-8035-5b2bd411e82d_167', 'target': 'mom-health-hcluster', 'status': 'Completed', 'startTimeUtc': '2023-05-01T21:35:11.640423Z', 'endTimeUtc': '2023-05-01T21:35:32.160323Z', 'services': {}, 'properties': {'_azureml.ComputeTargetType': 'amlctrain', 'ContentSnapshotId': '447dcc71-7abb-4f5f-8442-751a3ccc20f6', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json'}, 'inputDatasets': [{'dataset': {'id': '22063e2c-656b-43fd-be60-566ae43b64af'}, 'consumptionDetails': {'type': 'Reference'}}, {'dataset': {'id': '21da7920-cc82-4b3e-82ea-6b8d11449ca1'}, 'consumptionDetails': {'type': 'Reference'}}, {'dataset': {'id': '9f99953c-2317-4803-9e94-85411f5e74f8'}, 'consumptionDetails': {'type': 'Reference'}}, {'dataset': {'id': '04a4df13-660c-4ca4-89aa-26f1c89cbc34'}, 'consumptionDetails': {'type': 'Reference'}}], 'outputDatasets': [], 'runDefinition': {'script': 'train.py', 'command': '', 'useAbsolutePath': False, 'ar

In [10]:
# Register the model file written by the best run under the name 'hyper-momhealth'
registered_model = best_run.register_model(
    model_name='hyper-momhealth',
    model_path='outputs/model.joblib',
)

# Display the registered model as the cell output
registered_model

Model(workspace=Workspace.create(name='ud3', subscription_id='35976fdd-d799-46b2-9fdd-859ceba2a824', resource_group='aml-rg'), name=hyper-momhealth, id=hyper-momhealth:3, version=3, tags={}, properties={})

## Clean up after ourselves

Deprovision the compute cluster used for training.

In [None]:
# Delete the training cluster and wait for the deletion to finish
try:
    my_compute_target.delete()
    # is_delete_operation=True tells the SDK this wait is for a delete,
    # not a provisioning operation.
    my_compute_target.wait_for_completion(show_output=True, is_delete_operation=True)
except ComputeTargetException:
    print('ComputeTarget not found')

InProgress

## Model Deployment

I deployed the model from the AutoML exercise, so this model is not deployed here; only one deployment is needed.