In [1]:
import uuid

import matplotlib.pyplot as plt
import numpy as np
from azureml.core import Environment, Workspace
from azureml.core.compute import AksCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice, AksWebservice, Webservice

# Workspace

### Option 1: Create Workspace from Config

In [2]:
# Build the workspace handle used by every later cell from the Azure ML config.
# NOTE(review): presumably this reads a local config.json — confirm where it is expected to live.
ws = Workspace.from_config()

### Option 2: Create Workspace from Connection Info

In [3]:
# ws = Workspace.get(name="sbirk-aml-ws",
#                    subscription_id="bf088f59-f015-4332-bd36-54b988be7c90",
#                    resource_group="sbirk-aml-rg")       

# Model

### Retrieve Registered Model from Workspace

In [4]:
# Look up the model registered in the workspace under `model_name`.
model_name = "cifar10-model"
model = Model(name=model_name, workspace=ws)

# Deployment Environment

### Option 1: Retrieve Environment from Workspace

In [5]:
# env_name = "pytorch-aml-env"
# env = Environment.get(workspace=ws, name=env_name)

### Option 2: Create New Environment

In [19]:
# The first time an environment is used, Azure ML spends some time building it.
# On subsequent runs Azure ML keeps track of changes and reuses the existing build,
# resulting in faster run completion.
env = Environment.from_conda_specification(name="pytorch-aml-env",
                                           file_path="environment.yml")

# Use Python dependencies from your Docker image (as opposed to from conda specification)
# env.python.user_managed_dependencies=True

## Uncomment exactly one of the three options below
# Option 1: Use mcr base image
# env.docker.base_image = "mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20201113.v1"

# Option 2: Use custom base image from workspace-native ACR
# env.docker.base_image = "eafc0c3ef9714c74a4fa655ee90531ba.azurecr.io/base/pytorch"

# Option 3: Use custom base image from standalone ACR. For this you need to enable admin user in the ACR.
# The ACR admin password is read from the workspace's default Key Vault so that no
# credential is stored in the notebook. Store it once (outside of version control) with:
#   ws.get_default_keyvault().set_secret(name="sbirkacr-password", value="<acr-admin-password>")
# SECURITY: a password was previously committed in this cell; rotate it in the ACR.
acr_password_secret_name = "sbirkacr-password"
acr_password = ws.get_default_keyvault().get_secret(name=acr_password_secret_name)
if not acr_password:
    raise ValueError(
        "Key Vault secret '" + acr_password_secret_name + "' is missing or empty; "
        "store the ACR admin password in the workspace Key Vault first."
    )

env.docker.base_image = "sbirkacr.azurecr.io/base/pytorch"
env.docker.base_image_registry.address = "sbirkacr.azurecr.io"
env.docker.base_image_registry.username = "sbirkacr"
env.docker.base_image_registry.password = acr_password

# env.environment_variables = {"MESSAGE": "Hello from Azure Machine Learning"}
# This can be retrieved in training script with os.environ.get("MESSAGE")

# This will install the inference specific apt packages. This is needed for inferencing images.
env.inferencing_stack_version = "latest"

# env.register(workspace=ws)

# Inference Artifacts & Configuration

In [20]:
# Create Inference Config
inference_config = InferenceConfig(entry_script="deployment/score.py",
                                   environment=env)

# Deployment

## Option 1: Package Models

[Package a registered model with docker](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-package-models):

In some cases, you might want to create a Docker image without deploying the model (if, for example, you plan to deploy to Azure App Service). Or you might want to download the image and run it on a local Docker installation. You might even want to download the files used to build the image, inspect them, modify them, and build the image manually.

Model packaging enables you to do these things. It packages all the assets needed to host a model as a web service and allows you to download either a fully built Docker image or the files needed to build one. There are two ways to use model packaging:

**Download a packaged model:** Download a Docker image that contains the model and other files needed to host it as a web service.

**Generate a Dockerfile:** Download the Dockerfile, model, entry script, and other assets needed to build a Docker image. You can then inspect the files or make changes before you build the image locally.

Creating a package is similar to deploying a model. You use a registered model and an inference configuration. The following code builds an image, which is registered in the Azure Container Registry for your workspace.

In [21]:
# Repository name and tag given to the packaged image.
image_name = "inference/cifar10-pytorch2"
image_label = "1.0"

# Request a fully built image (generate_dockerfile=False) rather than a Dockerfile plus assets.
package = Model.package(
    workspace=ws,
    models=[model],
    inference_config=inference_config,
    generate_dockerfile=False,
    image_name=image_name,
    image_label=image_label,
)

# Block until the package has been created, printing the build output as it arrives.
package.wait_for_creation(show_output=True)

2021/01/08 14:09:15 Downloading source code...
2021/01/08 14:09:17 Finished downloading source code
2021/01/08 14:09:17 Creating Docker network: acb_default_network, driver: 'bridge'
2021/01/08 14:09:18 Successfully set up Docker network: acb_default_network
2021/01/08 14:09:18 Setting up Docker configuration...
2021/01/08 14:09:18 Successfully set up Docker configuration
2021/01/08 14:09:18 Logging in to registry: eafc0c3ef9714c74a4fa655ee90531ba.azurecr.io
2021/01/08 14:09:24 Successfully logged into eafc0c3ef9714c74a4fa655ee90531ba.azurecr.io
2021/01/08 14:09:24 Executing step ID: acb_step_0. Timeout(sec): 5400, Working directory: '', Network: 'acb_default_network'
2021/01/08 14:09:24 Launching container with name: acb_step_0
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
2021/01/08 14:09:26 Successfully executed container: acb_step_0
2021/01/08 14:09:26 Executing step ID: acb_step_1. Timeout(sec): 5400, Working directory: '', Network:

After you create a package, you can use package.pull() to pull the image to your local Docker environment. This can only be used if `generate_dockerfile` is set to `False`. When the package is pulled, use the `docker images` command to list the local images.

In [None]:
# package.pull()

To start a local container based on this image, use the following command to start a named container from the shell or command line. Replace the `<imageid>` value with the image ID returned by the docker images command.

`docker run -p 6789:5001 --name mycontainer <imageid>`

This command starts a container from the image identified by `<imageid>`. It maps local port 6789 to the port in the container on which the web service is listening (5001). It also assigns the name `mycontainer` to the container, which makes the container easier to stop. After the container is started, you can submit requests to http://localhost:6789/score.

After you create a Dockerfile, you can use package.save() to download the Dockerfile and corresponding artifacts to your local machine. This can only be used if `generate_dockerfile` is set to `True`.

In [None]:
# package.save("./docker")

## Option 2: Deploy to Compute Target

### Option 1: Azure Container Instances

In [18]:
# aci_config = AciWebservice.deploy_configuration(cpu_cores=1,
#                                                 memory_gb=1,
#                                                 tags={"data": "Cifar",  "method" : "Pytorch"},
#                                                 description="Predict cifar images with a Pytorch CNN")

'aci_config = AciWebservice.deploy_configuration(cpu_cores=1, \n                                               memory_gb=1, \n                                               tags={"data": "Cifar",  "method" : "Pytorch"}, \n                                               description="Predict cifar images with a Pytorch CNN")'

### Option 2: Azure Kubernetes Service

#### Option 2.1: Provisioning New AKS Cluster

In [25]:
# Name of the AKS compute target to reuse or create.
aks_name = "my-test-aks"

try:
    # Reuse the compute target when one with this name is already in the workspace.
    aks_target = ComputeTarget(workspace=ws, name=aks_name)
    print("Found existing cluster, use it.")
except ComputeTargetException:
    # No such target: provision a new cluster with the dev/test cluster purpose.
    # For the default configuration use AksCompute.provisioning_configuration() with no arguments.
    prov_config = AksCompute.provisioning_configuration(
        cluster_purpose=AksCompute.ClusterPurpose.DEV_TEST
    )

    # Example configuration to use an existing virtual network
    # prov_config.vnet_name = "mynetwork"
    # prov_config.vnet_resourcegroup_name = "mygroup"
    # prov_config.subnet_name = "default"
    # prov_config.service_cidr = "10.0.0.0/16"
    # prov_config.dns_service_ip = "10.0.0.10"
    # prov_config.docker_bridge_cidr = "172.17.0.1/16"

    aks_target = ComputeTarget.create(
        workspace=ws,
        name=aks_name,
        provisioning_configuration=prov_config,
    )

# Only wait when the target does not already report the "Succeeded" state.
if aks_target.get_status() != "Succeeded":
    aks_target.wait_for_completion(show_output=True)

Creating..
FailedProvisioning operation finished, operation "Failed"


ComputeTargetException: ComputeTargetException:
	Message: Compute object provisioning polling reached non-successful terminal state, current provisioning state: Failed
Provisioning operation error:
{'code': 'InvalidTemplateDeployment', 'message': "The template deployment '43e4aff2-41db-4acc-a8c9-12aec0bf6418' is not valid according to the validation procedure. The tracking id is '54694bd8-1f8d-4c3c-bc20-98d3105fcc9b'. See inner errors for details.", 'details': [{'code': 'InvalidParameter', 'message': 'Provisioning of resource(s) for container service my-test-aksa7061699d9 in resource group rg-msaiempmodeltrainingamlwsf1aplateng-dev-westeurope-01 failed. Message: {\n  "code": "InvalidParameter",\n  "message": "The length of the node resource group name is too long. The maximum length is 80 and the length of the value provided is 92. Please see https://aka.ms/aks-naming-rules for more details.",\n  "target": "name"\n }. Details: '}]}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Compute object provisioning polling reached non-successful terminal state, current provisioning state: Failed\nProvisioning operation error:\n{'code': 'InvalidTemplateDeployment', 'message': \"The template deployment '43e4aff2-41db-4acc-a8c9-12aec0bf6418' is not valid according to the validation procedure. The tracking id is '54694bd8-1f8d-4c3c-bc20-98d3105fcc9b'. See inner errors for details.\", 'details': [{'code': 'InvalidParameter', 'message': 'Provisioning of resource(s) for container service my-test-aksa7061699d9 in resource group rg-msaiempmodeltrainingamlwsf1aplateng-dev-westeurope-01 failed. Message: {\\n  \"code\": \"InvalidParameter\",\\n  \"message\": \"The length of the node resource group name is too long. The maximum length is 80 and the length of the value provided is 92. Please see https://aka.ms/aks-naming-rules for more details.\",\\n  \"target\": \"name\"\\n }. Details: '}]}"
    }
}

#### Option 2.2: Using an Existing AKS Cluster

In [19]:
# Reference the AKS compute target with this name in the workspace and print its status.
aks_target_name = "newaksinf04"
aks_target = AksCompute(name=aks_target_name, workspace=ws)
print(aks_target.get_status())

Succeeded


In [22]:
# Endpoint name and resource request (1 CPU core, 4 GB memory) for the AKS web service.
aks_service_name = "newendpoint06"
aks_config = AksWebservice.deploy_configuration(memory_gb=4, cpu_cores=1)

In [23]:
%%time
aks_service = Model.deploy(workspace=ws,
                           name=aks_service_name,
                           models=[model],
                           inference_config=inference_config,
                           deployment_config=aks_config,
                           deployment_target=aks_target)

aks_service.wait_for_deployment(show_output = True)
print(aks_service.state)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running..
Failed


ERROR - Service deployment polling reached non-successful terminal state, current service state: Failed
Operation ID: 0d9dbf05-cd43-48fa-8d89-be54d7536287
More information can be found using '.get_logs()'
Error:
{
  "code": "Forbidden",
  "statusCode": 403,
  "message": "Forbidden",
  "details": [
    {
      "code": "KubernetesError",
      "message": "Kubernetes error: Forbidden. Reason: {\"kind\":\"Status\",\"apiVersion\":\"v1\",\"metadata\":{},\"status\":\"Failure\",\"message\":\"configmaps \\\"newendpoint06c6f00dadb1d747b6aafb199b6e6ea60c-config\\\" is forbidden: unable to create new content in namespace azureml-amls-msai-f1a-dev-westeurope-01 because it is being terminated\",\"reason\":\"Forbidden\",\"details\":{\"name\":\"newendpoint06c6f00dadb1d747b6aafb199b6e6ea60c-config\",\"kind\":\"configmaps\",\"causes\":[{\"reason\":\"NamespaceTerminating\",\"message\":\"namespace azureml-amls-msai-f1a-dev-westeurope-01 is being terminated\",\"field\":\"metadata.namespace\"}]},\"code\":40

WebserviceException: WebserviceException:
	Message: Service deployment polling reached non-successful terminal state, current service state: Failed
Operation ID: 0d9dbf05-cd43-48fa-8d89-be54d7536287
More information can be found using '.get_logs()'
Error:
{
  "code": "Forbidden",
  "statusCode": 403,
  "message": "Forbidden",
  "details": [
    {
      "code": "KubernetesError",
      "message": "Kubernetes error: Forbidden. Reason: {\"kind\":\"Status\",\"apiVersion\":\"v1\",\"metadata\":{},\"status\":\"Failure\",\"message\":\"configmaps \\\"newendpoint06c6f00dadb1d747b6aafb199b6e6ea60c-config\\\" is forbidden: unable to create new content in namespace azureml-amls-msai-f1a-dev-westeurope-01 because it is being terminated\",\"reason\":\"Forbidden\",\"details\":{\"name\":\"newendpoint06c6f00dadb1d747b6aafb199b6e6ea60c-config\",\"kind\":\"configmaps\",\"causes\":[{\"reason\":\"NamespaceTerminating\",\"message\":\"namespace azureml-amls-msai-f1a-dev-westeurope-01 is being terminated\",\"field\":\"metadata.namespace\"}]},\"code\":403}\n"
    }
  ]
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Service deployment polling reached non-successful terminal state, current service state: Failed\nOperation ID: 0d9dbf05-cd43-48fa-8d89-be54d7536287\nMore information can be found using '.get_logs()'\nError:\n{\n  \"code\": \"Forbidden\",\n  \"statusCode\": 403,\n  \"message\": \"Forbidden\",\n  \"details\": [\n    {\n      \"code\": \"KubernetesError\",\n      \"message\": \"Kubernetes error: Forbidden. Reason: {\\\"kind\\\":\\\"Status\\\",\\\"apiVersion\\\":\\\"v1\\\",\\\"metadata\\\":{},\\\"status\\\":\\\"Failure\\\",\\\"message\\\":\\\"configmaps \\\\\\\"newendpoint06c6f00dadb1d747b6aafb199b6e6ea60c-config\\\\\\\" is forbidden: unable to create new content in namespace azureml-amls-msai-f1a-dev-westeurope-01 because it is being terminated\\\",\\\"reason\\\":\\\"Forbidden\\\",\\\"details\\\":{\\\"name\\\":\\\"newendpoint06c6f00dadb1d747b6aafb199b6e6ea60c-config\\\",\\\"kind\\\":\\\"configmaps\\\",\\\"causes\\\":[{\\\"reason\\\":\\\"NamespaceTerminating\\\",\\\"message\\\":\\\"namespace azureml-amls-msai-f1a-dev-westeurope-01 is being terminated\\\",\\\"field\\\":\\\"metadata.namespace\\\"}]},\\\"code\\\":403}\\n\"\n    }\n  ]\n}"
    }
}

In [None]:
%%time
env = Environment.get(workspace=ws, name="pytorch-aml-env")

inference_config = InferenceConfig(entry_script="deployment/score.py",
                                   environment=env)

service_name = "Cifar-pytorch-service" + str(uuid.uuid4())[:4]

service = Model.deploy(workspace=ws, 
                       name=service_name, 
                       models=[model], 
                       inference_config=inference_config, 
                       deployment_config=aci_config)

service.wait_for_deployment(show_output=True)

In [None]:
# Show the URI that scoring requests for the deployed service are sent to.
print(service.scoring_uri)

# Test the model

In [None]:
from utils import load_data
import os
import glob

data_folder = os.path.join(os.getcwd(), 'data')


def _find_data_file(file_name):
    """Return the first file called `file_name` found anywhere below `data_folder`.

    Raises:
        FileNotFoundError: if no such file exists, e.g. because the data set has not
            been downloaded yet. This replaces the bare IndexError the unguarded
            ``glob(...)[0]`` lookup used to raise.
    """
    matches = glob.glob(os.path.join(data_folder, "**/" + file_name), recursive=True)
    if not matches:
        raise FileNotFoundError(
            "Could not find '" + file_name + "' under '" + data_folder + "'. "
            "Run the cell that downloads the data set into the data folder first."
        )
    return matches[0]


# NOTE(review): these are the MNIST test-set file names while the deployed model is
# named "cifar10-model" — confirm this is the intended test data.
# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the neural network converge faster
X_test = load_data(_find_data_file("t10k-images-idx3-ubyte.gz"), False) / 255.0
y_test = load_data(_find_data_file("t10k-labels-idx1-ubyte.gz"), True).reshape(-1)

In [None]:
import os
from azureml.core import Dataset
from azureml.opendatasets import MNIST

# Make sure ./data exists before anything is downloaded into it.
data_folder = os.path.join(os.getcwd(), "data")
os.makedirs(data_folder, exist_ok=True)

# Download the MNIST file dataset into the data folder, overwriting existing files.
mnist_file_dataset = MNIST.get_file_dataset()
mnist_file_dataset.download(target_path=data_folder, overwrite=True)

In [None]:
import json

# Send the whole test set to the service as one UTF-8 encoded JSON payload.
test = json.dumps({"data": X_test.tolist()}).encode("utf8")
y_hat = service.run(input_data=test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true labels against the service's predictions.
conf_mx = confusion_matrix(y_true=y_test, y_pred=y_hat)
print(conf_mx)

# Share of predictions that equal the true label.
accuracy = np.average(y_hat == y_test)
print('Overall accuracy:', accuracy)

In [None]:
# Turn counts into per-row rates and zero the diagonal so that the correct
# predictions do not overpower the error cells in the plot.
norm_conf_mx = conf_mx / conf_mx.sum(axis=1, keepdims=True)
row_sums = conf_mx.sum(axis=1, keepdims=True)
np.fill_diagonal(norm_conf_mx, 0)

fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(1, 1, 1)
cax = ax.matshow(norm_conf_mx, cmap=plt.cm.bone)

# One tick per class index 0..9 on both axes.
ticks = np.arange(10)
for set_ticks, set_labels in ((ax.set_xticks, ax.set_xticklabels),
                              (ax.set_yticks, ax.set_yticklabels)):
    set_ticks(ticks)
    set_labels(ticks)

fig.colorbar(cax)
plt.ylabel('true labels', fontsize=14)
plt.xlabel('predicted values', fontsize=14)

# Write the figure to disk before showing it.
plt.savefig('conf.png')
plt.show()

In [None]:
import json

# Draw n distinct random row indices from the test set.
n = 30
sample_indices = np.random.permutation(X_test.shape[0])[0:n]

# Score just those rows with the deployed model.
test_samples = json.dumps({"data": X_test[sample_indices].tolist()}).encode("utf8")
result = service.run(input_data=test_samples)

# One narrow subplot per sample, with the prediction printed above the image.
plt.figure(figsize=(20, 1))

for i, s in enumerate(sample_indices):
    plt.subplot(1, n, i + 1)
    # NOTE(review): an empty string is passed as the line position here — confirm this
    # behaves as intended with the installed matplotlib version.
    plt.axhline('')
    plt.axvline('')

    # Misclassified samples get a red label and the inverted gray colour map.
    if y_test[s] != result[i]:
        font_color = 'red'
        clr_map = plt.cm.gray
    else:
        font_color = 'black'
        clr_map = plt.cm.Greys

    plt.text(x=10, y=-10, s=result[i], fontsize=18, color=font_color)
    plt.imshow(X_test[s].reshape(28, 28), cmap=clr_map)

plt.show()

In [None]:
import json
import requests

# Send a random row from the test set to score.
# randint's upper bound is exclusive, so len(X_test) (not len(X_test) - 1) is needed
# for the last row to be selectable as well.
random_index = np.random.randint(0, len(X_test))

# Serialize with json.dumps instead of str(list(...)): the string form of a list of
# NumPy scalars is not guaranteed to be valid JSON.
input_data = json.dumps({"data": [X_test[random_index].tolist()]})

headers = {'Content-Type':'application/json'}

# For an AKS deployment you'd need to add the service key to the header as well
# api_key = service.get_key()
# headers = {'Content-Type':'application/json',  'Authorization':('Bearer '+ api_key)}

resp = requests.post(service.scoring_uri, input_data, headers=headers)

print("POST to url", service.scoring_uri)
#print("input data:", input_data)
print("label:", y_test[random_index])
print("prediction:", resp.text)

In [None]:
# Delete the deployed web service once testing is finished.
service.delete()