## Hyperparameter Tuning in SageMaker

In [3]:
!pip install torchvision
import sagemaker
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = "sagemaker/DEMO-pytorch-cifar"

role = sagemaker.get_execution_role()

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [4]:
from torchvision.datasets import CIFAR10
from torchvision import transforms


local_dir = 'data'
CIFAR10.mirrors = ["https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/CIFAR10/"]
CIFAR10(
    local_dir,
    download=True,
    transform=transforms.Compose(
        [transforms.ToTensor()]
    )
)

Files already downloaded and verified


Dataset CIFAR10
    Number of datapoints: 50000
    Root location: data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
           )

In [5]:
# TODO: Upload the data to an S3 bucket. You can use the sagemaker_session object, boto3 or the AWS CLI
inputs = sagemaker_session.upload_data(path="data", bucket=bucket, key_prefix=prefix)
print("input spec (in this case, just an S3 path): {}".format(inputs))

input spec (in this case, just an S3 path): s3://sagemaker-us-east-1-264082167679/sagemaker/DEMO-pytorch-cifar


In [6]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    entry_point="cifar.py",
    role=role,
    py_version='py36',
    framework_version="1.8",
    instance_count=1,
    instance_type="ml.m5.large"
)

In [7]:
hyperparameter_ranges = {
    "lr": ContinuousParameter(0.001, 0.1),
    "batch-size": CategoricalParameter([32, 64, 128, 256, 512]),
    "epochs": IntegerParameter(2, 4)
}#TODO: Initialise your hyperparameters

In [8]:
objective_metric_name = "average test loss"
objective_type = "Minimize"
metric_definitions = [{"Name": "average test loss", "Regex": "Test set: Average loss: ([0-9\\.]+)"}]

In [9]:
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    max_jobs=4,
    max_parallel_jobs=2,
    objective_type=objective_type,
)#TODO: Create your HyperparameterTuner Object

In [10]:
tuner.fit({"training": inputs})#TODO: Train your model

.......................................................................................................................................................!


In [11]:
predictor = tuner.deploy(initial_instance_count=1, instance_type="ml.t2.medium")


2022-01-20 16:45:47 Starting - Preparing the instances for training
2022-01-20 16:45:47 Downloading - Downloading input data
2022-01-20 16:45:47 Training - Training image download completed. Training in progress.
2022-01-20 16:45:47 Uploading - Uploading generated training model
2022-01-20 16:45:47 Completed - Training job completed
-----------!

## Query the Endpoint

In [12]:
import gzip 
import numpy as np
import random
import os

file = 'data/cifar-10-batches-py/data_batch_1'
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

data=unpickle(file)
data=np.reshape(data[b'data'][0], (3, 32, 32))

In [20]:
# response = predictor.predict(np.expand_dims(data, axis=1))
# print("Raw prediction result:")
# print(response)
# print()

response = predictor.predict(np.expand_dims(data, axis=1))
print("Raw prediction result:")
print(response)
print()

labeled_predictions = list(zip(range(10), response[0]))
print("Labeled predictions: ")
print(labeled_predictions)
print()

labeled_predictions.sort(key=lambda label_and_prob: 1.0 - label_and_prob[1])
print("Most likely answer: {}".format(labeled_predictions[0])) # TODO: Query the endpoint
print(response)
# labeled_predictions = list(zip(range(10), response[0]))
# print("Labeled predictions: ")
# print(labeled_predictions)
# print()

# labeled_predictions.sort(key=lambda label_and_prob: 1.0 - label_and_prob[1])
# print("Most likely answer: {}".format(labeled_predictions[0]))# TODO: Query the endpoint
# print(response)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "Given groups=1, weight of size [6, 3, 5, 5], expected input[3, 1, 32, 32] to have 3 channels, but got 1 channels instead
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/sagemaker_inference/transformer.py", line 126, in transform
    result = self._transform_fn(self._model, input_data, content_type, accept)
  File "/opt/conda/lib/python3.6/site-packages/sagemaker_inference/transformer.py", line 216, in _default_transform_fn
    prediction = self._predict_fn(data, model)
  File "/opt/conda/lib/python3.6/site-packages/sagemaker_pytorch_serving_container/default_pytorch_inference_handler.py", line 125, in default_predict_fn
    output = model(input_data)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/ml/model/code/cifar.py", line 33, in forward
    x = self.pool(F.relu(self.conv1(x)))
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 399, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 396, in _conv_forward
    self.padding, self.dilation, self.groups)
RuntimeError: Given groups=1, weight of size [6, 3, 5, 5], expected input[3, 1, 32, 32] to have 3 channels, but got 1 channels instead
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/pytorch-training-220120-1633-004-c85a5dfe in account 264082167679 for more information.

### Cleanup

After you have finished with this exercise, remember to delete the prediction endpoint to release the instance associated with it

In [None]:
tuner.delete_endpoint()