## Hyperparameter Tuning - Azure Machine Learning Service

In [1]:
import azureml.core
from azureml.core import Workspace

# Report which version of the Azure ML core SDK this kernel has loaded.
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)

Azure ML SDK Version:  1.0.23


### Connect to Workspace

In [2]:
# Load the workspace configuration from the config.json file in the current folder.
ws = Workspace.from_config()
# Print each identifying field exactly once, tab-separated
# (the location used to be printed twice).
print(ws.name, ws.location, ws.resource_group, sep='\t')

Found the config file in: /Users/Rithin/Desktop/Projects/session-based-vehicle-recommendations/model/config.json
ML-Workspace	westeurope	ML-DEV	westeurope


### Creating an Experiment

In [14]:
from azureml.core import Experiment

# Name of the experiment that the run submitted later in this notebook belongs to.
experiment_name = 'vehicle-views-all-sessions-features'
exp = Experiment(name=experiment_name, workspace=ws)

### Create Compute Resource - Run Based

In [15]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Cluster settings; each one can be overridden through an environment variable.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "gpucluster")
# Environment variables are always strings, so convert the node counts to
# integers before they are handed to the provisioning configuration.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 10))

# The default SKU, STANDARD_NC6, is a GPU VM size (the estimator below is
# created with use_gpu=True). Set AML_COMPUTE_CLUSTER_SKU to use another size.
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_NC6")


if compute_name in ws.compute_targets:
    # Reuse the compute target that is already registered under this name.
    compute_target = ws.compute_targets[compute_name]
    if compute_target and isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # Do not silently accept a target of an unexpected kind.
        print('WARNING: compute target ' + compute_name +
              ' exists but is not an AmlCompute cluster: ' +
              type(compute_target).__name__)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

found compute target. just use it. gpucluster


### Get and Upload Data for Training in the Cloud

In [16]:
import os
# Local folder for the click dataset, relative to the current working
# directory; it is created when it does not exist yet.
dataset_subdir = 'data/dataset_clicks_latest'
data_folder = os.path.join(os.getcwd(), dataset_subdir)
os.makedirs(data_folder, exist_ok=True)

# Default datastore of the workspace; print its type, account and container.
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)

# The upload is currently disabled; uncomment the next line to run it.
#ds.upload(src_dir=data_folder, target_path='dataset', overwrite=True, show_progress=True)

AzureBlob mlworkspace0237397596 azureml-blobstore-b63b2fa4-c464-45f5-b678-4f311b598992


### Train on Remote Cluster

#### Create Directory and Copy Training Scripts

In [17]:
import os
# Folder under the current working directory that holds the training scripts;
# create it when it is missing, leave it alone when it already exists.
working_dir = os.getcwd()
script_folder = os.path.join(working_dir, "training_scripts")
os.makedirs(script_folder, exist_ok=True)

In [18]:
import shutil
# Copy the training scripts into the script folder, in the listed order.
training_scripts = ('main.py', 'model.py', 'utils.py')
copied_paths = [shutil.copy(script_name, script_folder) for script_name in training_scripts]
# Keep the destination of the last copy as the cell's displayed value.
copied_paths[-1]

'/Users/Rithin/Desktop/Projects/session-based-vehicle-recommendations/model/training_scripts/utils.py'

#### Create an estimator

In [20]:
from azureml.train.dnn import PyTorch

# Arguments handed to the entry script: the 'dataset' path of the datastore,
# referenced as a mount, and the top_k value.
dataset_mount = ds.path('dataset').as_mount()
script_params = {'--dataset_folder': dataset_mount, '--top_k': 20}

# PyTorch estimator that runs main.py from the script folder on the compute target.
estimator_settings = dict(
    source_directory=script_folder,
    script_params=script_params,
    compute_target=compute_target,
    entry_script='main.py',
    use_gpu=True,
)
pt_est = PyTorch(**estimator_settings)

#### Set Up Hyperparameters

In [25]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveRunConfig, uniform, choice, PrimaryMetricGoal

# Search space handed to the random sampler: one entry per hyperparameter.
search_space = {
    'lr': uniform(0.0001, 0.005),
    'batchSize': choice(16, 32, 64, 100, 128, 256),
    'hiddenSize': choice(10, 25, 50, 100, 125, 150, 200),
    'step': choice(1, 2, 4, 8, 10),
    'epoch': choice(range(3, 6)),
    'use_features': choice('True', 'False'),
    'nonhybrid': choice('True', 'False'),
}
param_sampling = RandomParameterSampling(search_space)

# Bandit early-termination policy.
early_termination_policy = BanditPolicy(
    slack_factor=0.05,
    evaluation_interval=1,
    delay_evaluation=3,
)

# HyperDrive configuration: maximise 'Recall@20' over at most 100 runs,
# with at most 10 of them running concurrently.
hyperdrive_settings = dict(
    estimator=pt_est,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='Recall@20',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=100,
    max_concurrent_runs=10,
)
hyperdrive_run_config = HyperDriveRunConfig(**hyperdrive_settings)

#### Submit the training job to the cluster

In [26]:
from azureml.core import ScriptRunConfig

# Tags attached to the submitted run.
tags = {
    "from": "2019-03-15",
    "till": "2019-04-09",
    "training_sessions_sequences": "286667",
    "test_sessions_sequences": "40228",
    "test_days": "3",
}

# Submit the HyperDrive configuration to the experiment.
submit_options = dict(config=hyperdrive_run_config, tags=tags)
run = exp.submit(**submit_options)

In [27]:
# Display the submitted run as the cell output.
run
# Uncomment the next line to cancel the run.
#run.cancel()

Experiment,Id,Type,Status,Details Page,Docs Page
vehicle-views-all-sessions-features,vehicle-views-all-sessions-features_1554947984997,hyperdrive,Running,Link to Azure Portal,Link to Documentation


### Get the Best Model

In [None]:
# The run was submitted asynchronously: block until it has finished so the
# best child run is picked from the complete set of results.
run.wait_for_completion(show_output=True)

best_run = run.get_best_run_by_primary_metric()
if best_run is None:
    # Fail with a clear message instead of an AttributeError on the next line.
    raise RuntimeError("No child run reported the primary metric; cannot select a best run.")
best_run_metrics = best_run.get_metrics()

In [None]:
# Pull the arguments recorded in the best run's run definition and print them.
best_run_details = best_run.get_details()
run_definition = best_run_details['runDefinition']
parameter_values = run_definition['Arguments']

print(parameter_values)

### Register Best Model

In [None]:
output_folder = "outputs"
model_name = "vehicle_recommendations_model"

# Register the model from the best child run: the model file is looked up in
# that run's outputs, not in the outputs of the parent HyperDrive run.
model_gnn = best_run.register_model(model_name=model_name,
                                    model_path=f'{output_folder}/{model_name}.pt',
                                    tags=tags)
