## Hyperparameter Tuning in SageMaker

In [2]:
import sagemaker
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = "sagemaker/DEMO-pytorch-cifar"

role = sagemaker.get_execution_role()

In [3]:
from torchvision.datasets import CIFAR10
from torchvision import transforms


local_dir = 'data'
CIFAR10.mirrors = ["https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/CIFAR10/"]
CIFAR10(
    local_dir,
    download=True,
    transform=transforms.Compose(
        [transforms.ToTensor()]
    )
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting data/cifar-10-python.tar.gz to data


Dataset CIFAR10
    Number of datapoints: 50000
    Root location: data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
           )

In [4]:
# TODO: Upload the data to an S3 bucket. You can use the sagemaker_session object, boto3 or the AWS CLI
inputs = sagemaker_session.upload_data(path="data", bucket=bucket, key_prefix=prefix)

print("input spec (in this case, just an S3 path): {}".format(inputs))

input spec (in this case, just an S3 path): s3://sagemaker-us-east-1-755391689112/sagemaker/DEMO-pytorch-cifar


In [10]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    entry_point="scripts/cifar.py",
    role=role,
    py_version='py36',
    framework_version="1.8",
    instance_count=1,
    instance_type="ml.m5.large"
)

In [11]:
hyperparameter_ranges = {
    "lr": ContinuousParameter(0.001, 0.1),
    "batch-size": CategoricalParameter([32, 64, 128, 256, 512]),
}

In [12]:
objective_metric_name = "average test loss"
objective_type = "Minimize"
metric_definitions = [{"Name": "average test loss", "Regex": "Test set: Average loss: ([0-9\\.]+)"}]

In [13]:
tuner =  HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    max_jobs=4,
    max_parallel_jobs=2,
    objective_type=objective_type,
)

In [15]:
tuner.fit({"training": inputs}, wait=True)

.......................................................................................................................................................!


In [16]:
tuner.best_estimator()
tuner.best_estimator().hyperparameters()


2021-12-20 23:54:00 Starting - Preparing the instances for training
2021-12-20 23:54:00 Downloading - Downloading input data
2021-12-20 23:54:00 Training - Training image download completed. Training in progress.
2021-12-20 23:54:00 Uploading - Uploading generated training model
2021-12-20 23:54:00 Completed - Training job completed

2021-12-20 23:54:00 Starting - Preparing the instances for training
2021-12-20 23:54:00 Downloading - Downloading input data
2021-12-20 23:54:00 Training - Training image download completed. Training in progress.
2021-12-20 23:54:00 Uploading - Uploading generated training model
2021-12-20 23:54:00 Completed - Training job completed


{'_tuning_objective_metric': '"average test loss"',
 'batch-size': '"64"',
 'lr': '0.08367322979687022',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"PyTorch"',
 'sagemaker_estimator_module': '"sagemaker.pytorch.estimator"',
 'sagemaker_job_name': '"pytorch-training-2021-12-20-23-42-04-368"',
 'sagemaker_program': '"cifar.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-755391689112/pytorch-training-2021-12-20-23-42-04-368/source/sourcedir.tar.gz"'}

In [17]:
predictor = tuner.deploy(initial_instance_count=1, instance_type="ml.t2.medium")


2021-12-20 23:54:00 Starting - Preparing the instances for training
2021-12-20 23:54:00 Downloading - Downloading input data
2021-12-20 23:54:00 Training - Training image download completed. Training in progress.
2021-12-20 23:54:00 Uploading - Uploading generated training model
2021-12-20 23:54:00 Completed - Training job completed
-------------------------!

In [None]:
from sagemaker.pytorch import PyTorchModel

pytorch_model = PyTorchModel(
    model_data='s3://sagemaker-us-east-1-755391689112/pytorch-training-2021-12-20-23-55-15-841/model.tar.gz',
    role=role, 
    entry_point='scripts/cifar.py',
    py_version='py36',
    framework_version="1.8",
)
predictor = pytorch_model.deploy(instance_type='ml.t2.medium', initial_instance_count=1)

----

## Query the Endpoint

In [16]:
import gzip 
import numpy as np
import random
import os
import torch

file = 'data/cifar-10-batches-py/data_batch_1'
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

data=unpickle(file)
data.keys()

dict_keys([b'batch_label', b'labels', b'data', b'filenames'])

In [17]:
data=np.reshape(data[b'data'][0], (1, 3, 32, 32))
data.shape
                            

(1, 3, 32, 32)

In [20]:
response = predictor.predict(data) # TODO: Query the endpoint
print(response)

ParamValidationError: Parameter validation failed:
Invalid type for parameter Body, value: [[[[ 59  43  50 ... 158 152 148]
   [ 16   0  18 ... 123 119 122]
   [ 25  16  49 ... 118 120 109]
   ...
   [208 201 198 ... 160  56  53]
   [180 173 186 ... 184  97  83]
   [177 168 179 ... 216 151 123]]

  [[ 62  46  48 ... 132 125 124]
   [ 20   0   8 ...  88  83  87]
   [ 24   7  27 ...  84  84  73]
   ...
   [170 153 161 ... 133  31  34]
   [139 123 144 ... 148  62  53]
   [144 129 142 ... 184 118  92]]

  [[ 63  45  43 ... 108 102 103]
   [ 20   0   0 ...  55  50  57]
   [ 21   0   8 ...  50  50  42]
   ...
   [ 96  34  26 ...  70   7  20]
   [ 96  42  30 ...  94  34  34]
   [116  94  87 ... 140  84  72]]]], type: <class 'numpy.ndarray'>, valid types: <class 'bytes'>, <class 'bytearray'>, file-like object

In [24]:
response = predictor.predict(data) # TODO: Query the endpoint
print(response)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "Expected 4-dimensional input for 4-dimensional weight [6, 3, 5, 5], but got 3-dimensional input of size [3, 32, 32] instead
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/sagemaker_inference/transformer.py", line 126, in transform
    result = self._transform_fn(self._model, input_data, content_type, accept)
  File "/opt/conda/lib/python3.6/site-packages/sagemaker_inference/transformer.py", line 216, in _default_transform_fn
    prediction = self._predict_fn(data, model)
  File "/opt/conda/lib/python3.6/site-packages/sagemaker_pytorch_serving_container/default_pytorch_inference_handler.py", line 125, in default_predict_fn
    output = model(input_data)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/ml/model/code/cifar.py", line 33, in forward
    x = self.pool(F.relu(self.conv1(x)))
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 399, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 396, in _conv_forward
    self.padding, self.dilation, self.groups)
RuntimeError: Expected 4-dimensional input for 4-dimensional weight [6, 3, 5, 5], but got 3-dimensional input of size [3, 32, 32] instead
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/pytorch-training-211220-2342-004-186c2f06 in account 755391689112 for more information.

### Cleanup

After you have finished with this exercise, remember to delete the prediction endpoint to release the instance associated with it

tuner.delete_endpoint()