# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [4]:
!pip install catboost

Collecting catboost
  Downloading catboost-0.26-cp36-none-manylinux1_x86_64.whl (69.2 MB)
[K     |████████████████████████████████| 69.2 MB 87.5 MB/s eta 0:00:01
Collecting plotly
  Using cached plotly-5.1.0-py2.py3-none-any.whl (20.6 MB)
Installing collected packages: plotly, catboost
Successfully installed catboost-0.26 plotly-5.1.0


In [8]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.widgets import RunDetails
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import DockerConfiguration


from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import MedianStoppingPolicy
from azureml.train.hyperdrive.sampling import BayesianParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice

from azureml.core import Dataset, Workspace, Experiment, Environment

from azureml.core.model import InferenceConfig

import joblib

In [13]:
ws = Workspace.from_config()
experiment_name = 'mushroom'

experiment=Experiment(ws, experiment_name)

## Dataset

In [14]:
dataset = Dataset.Tabular.from_delimited_files('https://raw.githubusercontent.com/sannif/udacity_capstone_project/main/dataset/mushrooms.csv')
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,True,p,False,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,True,a,False,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,True,l,False,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,True,p,False,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,False,n,False,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [10]:
# register the dataset
dataset = dataset.register(ws, name='Mushrooms')

{
  "source": [
    "https://raw.githubusercontent.com/sannif/udacity_capstone_project/main/dataset/mushrooms.csv"
  ],
  "definition": [
    "GetFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "8a8383b4-4167-4e72-a91d-1e913500fe42",
    "name": "Mushrooms",
    "version": 1,
    "workspace": "Workspace.create(name='quick-starts-ws-149420', subscription_id='d4ad7261-832d-46b2-b093-22156001df5b', resource_group='aml-quickstarts-149420')"
  }
}

## Compute

In [4]:
compute_name = "cluster1"
vm_size = "Standard_DS12_v2"
min_nodes, max_nodes = 1, 6
if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print("found compute target: " + compute_name)
else:
    print("creating new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(vm_size = vm_size, min_nodes = min_nodes, max_nodes = max_nodes)
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
compute_target.wait_for_completion(show_output=True)

found compute target: cluster1
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Hyperdrive Configuration

TODO: Explain the model you are using and the reason for chosing the different hyperparameters, termination policy and config settings.

In [18]:
early_termination_policy = MedianStoppingPolicy(delay_evaluation=5)

param_sampling = BayesianParameterSampling({
    'iterations': choice([100, 250, 500]),
    'learning_rate': uniform(0.01, 1),
    'l2_leaf_reg':uniform(1, 10000),
    'depth': choice(range(1, 17))
})


catboost_env = Environment.from_conda_specification(name='catboost-env', file_path='conda_dependencies.yml')
docker_config = DockerConfiguration(use_docker=True)


estimator = ScriptRunConfig(source_directory = '.',
                            script = 'train.py',
                            arguments=['--iterations', 100, '--learning_rate', 0.1, '--l2_leaf_reg', 10, '--depth', 6],
                            compute_target = compute_target,
                            environment = catboost_env,
                            docker_runtime_config=docker_config)

hyperdrive_run_config = HyperDriveConfig(run_config = estimator,
                                         hyperparameter_sampling = param_sampling,
                                         primary_metric_name='accuracy',
                                         primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                         max_total_runs=25,
                                         max_concurrent_runs=6)

For best results with Bayesian Sampling we recommend using a maximum number of runs greater than or equal to 20 times the number of hyperparameters being tuned. Recommendend value:80.


In [19]:
hyperdrive_run = experiment.submit(hyperdrive_run_config)

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [20]:
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model

TODO: In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [21]:
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
mushroom,HD_e3e97d4e-0f1d-44a2-ab05-754c7ca3dd0e_4,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [24]:
os.makedirs('models', exist_ok=True)
best_run.download_file('outputs/model.pkl', 'models/catboost.pkl')

## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

### Convert the model to ONNX

In [6]:
from azureml.automl.runtime.onnx_convert import OnnxConverter

In [9]:
with open('models/catboost.pkl', 'rb') as f:
    model = joblib.load(f)

In [10]:
dir(model)

['__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_base_calc_leaf_indexes',
 '_base_drop_unused_features',
 '_base_eval_metrics',
 '_base_predict',
 '_base_shrink',
 '_base_virtual_ensembles_predict',
 '_calc_fstr',
 '_calc_leaf_indexes',
 '_calc_ostr',
 '_check_is_compatible_loss',
 '_convert_to_asymmetric_representation',
 '_deserialize_model',
 '_estimator_type',
 '_eval_metrics',
 '_fit',
 '_get_borders',
 '_get_cat_feature_indices',
 '_get_embedding_feature_indices',
 '_get_float_feature_indices',
 '_get_nan_treatments',
 '_get_params',
 '_get_tags',
 '_get_text_feature_indices',
 '_get_tree_leaf_value

In [15]:
df = dataset.to_pandas_dataframe()
df_clean = df.drop(['gill-attachment', 'veil-type'], axis=1)
df_clean['bruises'] = df_clean['bruises'].replace({True: 1 , False: 0})
df_clean['class'] = df_clean['class'].replace({"p": 1 , "e": 0})
X = df_clean.drop('class', axis=1)

In [18]:
onnx_catboost = OnnxConverter()
onnx_catboost.initialize_input(X)

onnx_catboost.convert(model)

(None,
 None,
 None,
 {'Info': 'Failed to convert Model.',
  'Class': 'OnnxConvertException',
  'ErrorMsg': '[Scrubbed]',
  'RawModelString': '[Scrubbed]',
  'IsOnnxCompatibleMode': False,
  'StackTrace': '  File "onnx_converter.py", line 487, in convert\n    self._validate_raw_model(raw_model)\n  File "onnx_converter.py", line 679, in _validate_raw_model\n    raise OnnxConvertException(\'Invalid raw model type.\')\n'})

In [17]:
type(model)

catboost.core.CatBoostClassifier

In [None]:
env = Environment(name="project_environment")
dummy_inference_config = InferenceConfig(
    environment=catboost_env,
    source_directory=".",
    entry_script="score.py",
)

TODO: In the cell below, send a request to the web service you deployed to test it.

TODO: In the cell below, print the logs of the web service and delete the service