Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/production-deploy-to-aks-gpu/production-deploy-to-aks-gpu.png)

# Deploying a web service hosted on NVIDIA Triton to Azure Kubernetes Service (AKS)
This notebook shows the steps for deploying a service with [NVIDIA Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server): registering a model, creating an image, provisioning a cluster (a one-time action), and deploying a service to it.
We then test and delete the service, image and model.

In [1]:
# Show which azureml-core SDK version this kernel has loaded.
import azureml.core
print(azureml.core.VERSION)

Failure while loading azureml_run_type_providers. Failed to load entrypoint hyperdrive = azureml.train.hyperdrive:HyperDriveRun._from_run_dto with exception (azureml-core 1.13.0a0 (/home/ralf/.local/lib/python3.6/site-packages), Requirement.parse('azureml-core~=1.10.0'), {'azureml-telemetry'}).
Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception (azureml-core 1.13.0a0 (/home/ralf/.local/lib/python3.6/site-packages), Requirement.parse('azureml-core~=1.10.0'), {'azureml-telemetry'}).
Failure while loading azureml_run_type_providers. Failed to load entrypoint azureml.PipelineRun = azureml.pipeline.core.run:PipelineRun._from_dto with exception (azureml-core 1.13.0a0 (/home/ralf/.local/lib/python3.6/site-packages), Requirement.parse('azureml-core~=1.10.0')).
Failure while loading azureml_run_type_providers. Failed to load entrypoint azureml.ReusedStepRun = azureml.pipeline.core.run:StepRun._from

# Get workspace
Load an existing workspace, either from the config file (`Workspace.from_config()`) or by subscription ID, resource group and workspace name.

In [3]:
import os

from azureml.core.workspace import Workspace

# Alternative: read the workspace coordinates from a local config file instead.
#ws = Workspace.from_config()

# The workspace coordinates are taken from environment variables. The defaults
# are the values this notebook was originally run with; set SUBSCRIPTION_ID,
# RESOURCE_GROUP and WORKSPACE_NAME to point at your own workspace.
subscription_id = os.getenv("SUBSCRIPTION_ID", default="5f08d643-1910-4a38-a7c7-84a39d4f42e0")
resource_group = os.getenv("RESOURCE_GROUP", default="yifyu")
workspace_name = os.getenv("WORKSPACE_NAME", default="triton2")
ws = Workspace.get(
    subscription_id = subscription_id,
    resource_group = resource_group,
    name = workspace_name)
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

triton2
yifyu
westus
5f08d643-1910-4a38-a7c7-84a39d4f42e0


# Register the model
Register an existing trained model, add description and tags.

**Note:** Under `model_path` there must be a sub-directory named `triton`, which has the structure of a Triton [Model Repository](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/model_repository.html#repository-layout).

In [35]:
from azureml.core.model import Model

# Upload the local "models" directory and register it in the workspace
# under the name "densenet_onnx_orig", with descriptive tags.
model = Model.register(
    workspace=ws,
    model_path="models",
    model_name="densenet_onnx_orig",
    description="Image classification trained on Imagenet Dataset",
    tags={'area': "Image classification", 'type': "classification"},
)

# Each registration under the same name yields a new model version.
print(model.name, model.description, model.version)

Registering model densenet_onnx_orig
densenet_onnx_orig Image classification trained on Imagenet Dataset 4


# Provision the AKS Cluster
This is a one time setup. You can reuse this cluster for multiple deployments after it has been created. If you delete the cluster or the resource group that contains it, then you would have to recreate it.

In [8]:
from azureml.core.compute import ComputeTarget, AksCompute
from azureml.core.compute_target import ComputeTargetException

# Name under which the AKS compute target is known to the workspace
gpu_cluster_name = "aks-cluster-1"

try:
    # Reuse the compute target when the workspace already has one with this name
    gpu_cluster = ComputeTarget(workspace=ws, name=gpu_cluster_name)
except ComputeTargetException:
    # The lookup failed, so provision a new one-node dev/test cluster
    print("Creating new gpu-cluster")

    # NOTE(review): Standard_F32s_v2 appears to be a CPU-only SKU despite the
    # gpu_* naming used here - confirm this is the intended VM size.
    compute_config = AksCompute.provisioning_configuration(
        vm_size="Standard_F32s_v2",
        agent_count=1,
        cluster_purpose=AksCompute.ClusterPurpose.DEV_TEST,
    )

    gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, compute_config)

    # Block until provisioning has finished, streaming the progress log
    gpu_cluster.wait_for_completion(show_output=True)
else:
    print("Found existing gpu cluster")

Creating new gpu-cluster
Creating...........................................................
SucceededProvisioning operation finished, operation "Succeeded"


# Deploy the model as a web service to AKS

First create a scoring script

**Note:** Triton server listens on a fixed local port. You may choose to use the Triton Python [client library](https://docs.nvidia.com/deeplearning/triton-inference-server/master-user-guide/docs/client_library.html) to talk to it, while keeping the flexibility of pre-/post-processing.

In [36]:
%%writefile score.py
import numpy as np
from PIL import Image
import sys
from functools import partial
import os
import io
import onnxruntime

from azureml.contrib.services.aml_request import AMLRequest, rawhttp
from azureml.contrib.services.aml_response import AMLResponse

def _model_root():
    """Return the directory that holds model.onnx and the utils module.

    AZUREML_MODEL_DIR is an environment variable created during deployment.
    It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION);
    for multiple models, it points to the folder containing all deployed
    models (./azureml-models).

    When the variable is not set (for example when score.py is run directly
    through the __main__ guard), fall back to ./models relative to the working
    directory instead of failing with a TypeError in os.path.join(None, ...).
    """
    return os.path.join(os.getenv('AZUREML_MODEL_DIR', ''), 'models')


# The utils module is imported from the model directory, so that directory
# has to be on sys.path before the import below runs.
sys.path.append(_model_root())
from utils import preprocess, postprocess

# Arguments passed to preprocess() for every request.
_scaling = "INCEPTION"
dtype = np.float32
max_batch_size = 0

def init():
    """Load the ONNX model once and cache the session and its I/O metadata.

    Populates the module globals read by score(): `session`, `input_name`,
    `output_name` and `input_shape`, taken from the model's first input and
    first output.
    """
    global session, input_name, output_name, input_shape

    model = os.path.join(_model_root(), 'model.onnx')
    session = onnxruntime.InferenceSession(model, None)
    input_name = session.get_inputs()[0].name
    input_shape = session.get_inputs()[0].shape
    output_name = session.get_outputs()[0].name
    print("input: ", input_name, ", output: ", output_name, ", input_shape: ", input_shape)


@rawhttp
def run(request):
    """Handle one raw HTTP scoring request.

    A POST body is opened as an image and scored; the scoring result is
    returned with status 200. Any other HTTP method gets a "bad request"
    response with status 500.
    """
    # Reject everything that is not a POST up front
    if request.method != 'POST':
        return AMLResponse("bad request", 500)

    # The request body carries the encoded image bytes
    raw_bytes = request.get_data(False)
    image = Image.open(io.BytesIO(raw_bytes))

    return AMLResponse(score(image), 200)

def score(data):
    """Run the model on one image and return the post-processed result.

    `data` is handed to preprocess() together with the module-level scaling
    mode and dtype (run() passes a PIL image). The shapes of the intermediate
    arrays are printed for debugging.
    """
    tensor = preprocess(data, _scaling, dtype)
    print(tensor.shape)

    # Reshape to the shape declared by the model's first input
    model_input = np.reshape(tensor, input_shape)
    outputs = session.run([output_name], {input_name: model_input})
    raw_scores = outputs[0]
    print(raw_scores.shape)

    flat_scores = raw_scores.flatten()
    print(len(flat_scores))

    return postprocess(flat_scores)

if __name__ == "__main__":
    # Local smoke test: load the model, then score car.jpg from the working directory
    init()
    sample_image = Image.open("car.jpg")
    print(score(sample_image))



Overwriting score.py


Now create the deployment configuration objects and deploy the model as a webservice.

In [60]:
# Set the web service configuration (using default here)
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AksWebservice
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.environment import Environment

# Build the inference environment from the conda specification in env.yml
conda_dep = CondaDependencies("env.yml")

# Optional: upload private wheels and add them to the conda dependencies
# client_whl_url = Environment.add_private_pip_wheel(workspace=ws, file_path = os.path.join(target_folder, client_filename), exist_ok=True)
# clientutils_whl_url = Environment.add_private_pip_wheel(workspace=ws, file_path = os.path.join(target_folder, clientutils_filename), exist_ok=True)
# conda_dep.add_pip_package(client_whl_url)
# conda_dep.add_pip_package(clientutils_whl_url)

env = Environment("test1")
env.python.conda_dependencies = conda_dep

# Optional: use the Azure ML Triton base image
# env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-nvidia-tritonserver20.07-py3'

# Optional: set a worker count to leverage the concurrency and server-side batching of Triton
# env.environment_variables = {"WORKER_COUNT":"128"}

# score.py is the entry script that runs inside the environment defined above
inference_config = InferenceConfig(environment=env, entry_script="score.py")

# One replica with 30 CPU cores and 40 GB of memory, key auth switched off
aks_config = AksWebservice.deploy_configuration(
    num_replicas=1,
    cpu_cores=30,
    memory_gb=40,
    replica_max_concurrent_requests=16,
    auth_enabled=False,
)

# Alternative: enable token auth and disable (key) auth on the webservice
# aks_config = AksWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 4, gpu_cores = 1, token_auth_enabled=True, auth_enabled=False)

In [61]:
%%time
aks_service_name = 'densenet-onnx-orig-1'

# Deploy the registered model to the AKS cluster as a web service
aks_service = Model.deploy(
    workspace=ws,
    name=aks_service_name,
    models=[model],
    deployment_target=gpu_cluster,
    deployment_config=aks_config,
    inference_config=inference_config,
)

# Block until the deployment has finished, then report the service state
aks_service.wait_for_deployment(show_output=True)
print(aks_service.state)

Running.....
Succeeded
AKS service creation operation finished, operation "Succeeded"
Healthy
CPU times: user 152 ms, sys: 35.3 ms, total: 187 ms
Wall time: 38.9 s


# Test the web service
We test the web service by sending it the content of a test image.

In [62]:
%%time
import requests

# The service is deployed with auth_enabled=False, so only the content type is sent.
headers = {'Content-Type':'application/octet-stream'}

# If (key) auth is enabled, fetch the keys and include one in the request instead:
# key1, key2 = aks_service.get_keys()
# headers = {'Content-Type':'application/octet-stream', 'Authorization': 'Bearer ' + key1}

# If token auth is enabled, fetch a token and include it in the request instead:
# access_token, fetch_after = aks_service.get_token()
# headers = {'Content-Type':'application/json', 'Authorization': 'Bearer ' + access_token}

# Read the raw image bytes; the context manager closes the file handle.
with open('car.jpg', 'rb') as image_file:
    test_sample = image_file.read()

resp = requests.post(aks_service.scoring_uri, test_sample, headers=headers)

# Only the response body is printed. The former print(key1) referenced a name
# that is never assigned while get_keys() is commented out, and would have
# written an auth key into the saved notebook output.
print(resp.text)

Received bad response from Model Management Service:
Response Code: 400
Headers: {'Date': 'Thu, 27 Aug 2020 09:55:23 GMT', 'Content-Type': 'application/json', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Request-Context': 'appId=cid-v1:2d2e8e63-272e-4b3c-8598-4ee570a0e70d', 'x-ms-client-request-id': 'dcaef7312af44e2390e86f25391ca9e6', 'x-ms-client-session-id': '82d4bb2e-29b2-4235-afe7-14b2fc49e973', 'api-supported-versions': '1.0, 2018-03-01-preview, 2018-11-19', 'x-request-time': '0.101', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains; preload'}
Content: b'{"code":"BadRequest","statusCode":400,"message":"The request is invalid.","details":[{"code":"AuthDisabled","message":"Authentication is disabled (authEnabled set to false). Enable service authentication to list/regenerate keys. Subscription: 5f08d643-1910-4a38-a7c7-84a39d4f42e0, ResourceGroup: yifyu, Workspace: triton2"}],"correlation":{"RequestId":"dcaef7312af44e2390e86f25391ca9e6"}}'



WebserviceException: WebserviceException:
	Message: Received bad response from Model Management Service:
Response Code: 400
Headers: {'Date': 'Thu, 27 Aug 2020 09:55:23 GMT', 'Content-Type': 'application/json', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Request-Context': 'appId=cid-v1:2d2e8e63-272e-4b3c-8598-4ee570a0e70d', 'x-ms-client-request-id': 'dcaef7312af44e2390e86f25391ca9e6', 'x-ms-client-session-id': '82d4bb2e-29b2-4235-afe7-14b2fc49e973', 'api-supported-versions': '1.0, 2018-03-01-preview, 2018-11-19', 'x-request-time': '0.101', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains; preload'}
Content: b'{"code":"BadRequest","statusCode":400,"message":"The request is invalid.","details":[{"code":"AuthDisabled","message":"Authentication is disabled (authEnabled set to false). Enable service authentication to list/regenerate keys. Subscription: 5f08d643-1910-4a38-a7c7-84a39d4f42e0, ResourceGroup: yifyu, Workspace: triton2"}],"correlation":{"RequestId":"dcaef7312af44e2390e86f25391ca9e6"}}'
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Received bad response from Model Management Service:\nResponse Code: 400\nHeaders: {'Date': 'Thu, 27 Aug 2020 09:55:23 GMT', 'Content-Type': 'application/json', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Request-Context': 'appId=cid-v1:2d2e8e63-272e-4b3c-8598-4ee570a0e70d', 'x-ms-client-request-id': 'dcaef7312af44e2390e86f25391ca9e6', 'x-ms-client-session-id': '82d4bb2e-29b2-4235-afe7-14b2fc49e973', 'api-supported-versions': '1.0, 2018-03-01-preview, 2018-11-19', 'x-request-time': '0.101', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains; preload'}\nContent: b'{\"code\":\"BadRequest\",\"statusCode\":400,\"message\":\"The request is invalid.\",\"details\":[{\"code\":\"AuthDisabled\",\"message\":\"Authentication is disabled (authEnabled set to false). Enable service authentication to list/regenerate keys. Subscription: 5f08d643-1910-4a38-a7c7-84a39d4f42e0, ResourceGroup: yifyu, Workspace: triton2\"}],\"correlation\":{\"RequestId\":\"dcaef7312af44e2390e86f25391ca9e6\"}}'"
    }
}

In [52]:
# Load-test the scoring endpoint with the wrk binary from the working directory,
# driven by the car.lua script: 3 connections (-c), 1 minute (-d), 1 thread (-t).
URI = aks_service.scoring_uri
!./wrk -c 3 -d 1m -t 1 -s car.lua $URI

thread 1 created
Running 1m test @ http://40.78.16.120:80/api/v1/service/densenet-onnx-orig-1/score
  1 threads and 3 connections
^C
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency   166.99ms   29.66ms 399.26ms   92.26%
    Req/Sec    18.14      4.52    30.00     77.82%
  502 requests in 28.14s, 157.37KB read
Requests/sec:     17.84
Transfer/sec:      5.59KB
thread 1 made 506 requests and got 502 responses, 4 request(s) do(es) not see response
------------------------------

Request = 506; Responses = 502; Missing requests: 4
------------------------------

HTTP Status 2xx Count: 502
HTTP Status 3xx Count: 0
HTTP Status 400 Count: 0
HTTP Status 401 Count: 0
HTTP Status 403 Count: 0
HTTP Status 404 Count: 0
HTTP Status 408 Count: 0
HTTP Status 429 Count: 0
HTTP Status 4xx Count: 0
HTTP Status 500 Count: 0
HTTP Status 501 Count: 0
HTTP Status 502 Count: 0
HTTP Status 503 Count: 0
HTTP Status 504 Count: 0
HTTP Status 5xx Count: 0
HTTP Status xxx Count: 0
------------------

# Clean up
Delete the service, image, model and compute target

In [None]:
%%time
# Tear down what this notebook created: the web service first, then the
# registered model, and finally the AKS compute target.
aks_service.delete()
model.delete()
gpu_cluster.delete()

In [20]:
# WARNING: this deletes every AKS web service that AksWebservice.list(ws) returns
# for the workspace, not only the service deployed by this notebook.
services = AksWebservice.list(ws)

for service in services:
    # Print the name before deleting so the output records what was removed
    print(service.name)
    service.delete()

densenet-onnx-orig
densenet-onnx-orig-1
aksservice-update


In [59]:
# Deletes only the web service deployed by this notebook (the same call as in the clean-up cell).
aks_service.delete()
