### single node triton server with load test
1. Loading to and from minio workflow (multiple models)
2. Getting models from [timm](https://timm.fast.ai/)
3. Making the node
4. Load and unload models operations
5. Monitoring
6. Naive load test
7. TODO Add language models from Huggingface


In [1]:
import timm
import torch
from PIL import Image
import os
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
len(timm.list_models())

739

In [4]:
os.system('sudo umount -l ~/my_mounting_point')
os.system('cc-cloudfuse mount ~/my_mounting_point')

data_folder_path = '/home/cc/my_mounting_point/datasets'
dataset_folder_path = os.path.join(
    data_folder_path, 'ILSVRC/Data/DET/test'
)
classes_file_path = os.path.join(
    data_folder_path, 'imagenet_classes.txt'
)

image_names = os.listdir(dataset_folder_path)
image_names.sort()
with open(classes_file_path) as f:
    classes = [line.strip() for line in f.readlines()]

def image_loader(folder_path, image_name):
    image = Image.open(
        os.path.join(folder_path, image_name))
    # if there was a need to filter out only color images
    # if image.mode == 'RGB':
    #     pass
    return image
num_loaded_images = 4
images = {
    image_name: image_loader(
        dataset_folder_path, image_name) for image_name in image_names[
            :num_loaded_images]}

In [5]:
# load and transform model
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225]
)])

batch = torch.stack(list(map(lambda a: transform(a), list(images.values()))))

In [None]:
# load and predict with the timm model
model_name = 'resnet50'
model = timm.create_model(model_name, pretrained=True)

model.eval()
torch_output = model(batch)
torch_output = torch.nn.functional.softmax(torch_output, dim=1) * 100
torch_output = torch_output.detach().numpy()
torch_output = torch_output.argmax(axis=1)
torch_class = np.array(classes)[torch_output]
torch_class

In [6]:
# save the onnx model
import torch.onnx

model_variant = 1
model_dir = os.path.join(
    'models',
    model_name,
    str(model_variant))
model_path = os.path.join(model_dir, 'model.onnx')
if 'models' not in os.listdir("./"):
    os.makedirs(model_dir)
# Standard ImageNet input - 3 channels, 224x224,
# values don't matter as we care about network structure.
# But they can also be real inputs.
dummy_input = torch.randn(1, 3, 224, 224)
# Invoke export
torch.onnx.export(
    model, dummy_input,
    model_path,
    input_names = ['input'],
    output_names = ['output'],
    dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
                  'output' : {0 : 'batch_size'}})

In [None]:
# use onnx model
import onnx
import onnxruntime

onnx_model = onnx.load(model_path)
onnx.checker.check_model(onnx_model)

ort_session = onnxruntime.InferenceSession(
    os.path.join(model_dir, "model.onnx"),
    providers=['CPUExecutionProvider'])
onnx_output = ort_session.run(None, {'input': batch.numpy()})
onnx_output = torch.nn.functional.softmax(torch.tensor(onnx_output), dim=1)[0] * 100
onnx_output = onnx_output.detach().numpy()
onnx_output = onnx_output.argmax(axis=1)
onnx_class = np.array(classes)[onnx_output]
onnx_class

In [9]:
# TODO find out why slightly different

assert np.all(onnx_output == torch_output)
print(onnx_class)

AssertionError: 

### Deploying on Onnx Docker

In [14]:
%%writefile models/resnet50/config.pbtxt
name: "resnet50"
platform: "onnxruntime_onnx"
max_batch_size : 100
input [
  {
    name: "input"
    data_type: TYPE_FP32
    format: FORMAT_NCHW
    dims: [ 3, 224, 224 ]
  }
]
output [
  {
    name: "output"
    data_type: TYPE_FP32
    dims: [ 1000 ]
  }
]

Overwriting models/resnet50/config.pbtxt


In [None]:
VERSION='22.05'
os.system(f"docker pull nvcr.io/nvidia/tritonserver:{VERSION}-py3")
# add --gpus=<number of gpus> on gpu machines
# add -d to run at background and going to the next cell
os.system("docker run --rm -d -p8000:8000 -p8001:8001 -p8002:8002"
          f" -v {os.getcwd()}/models:/models "
          f"nvcr.io/nvidia/tritonserver:{VERSION}-py3"
          " tritonserver --model-repository=/models")
# print("docker run --rm -d -p8000:8000 -p8001:8001 -p8002:8002"
#       f" -v {os.getcwd()}/models:/models "
#       f"nvcr.io/nvidia/tritonserver:{VERSION}-py3"
#       " tritonserver --strict-model-config=false --model-repository=/models")

### Python Client Examples

[examples](https://github.com/triton-inference-server/client/tree/main/src/python/examples)

[grpc](https://github.com/triton-inference-server/client/blob/main/src/python/library/tritonclient/grpc/__init__.py)

[http](https://github.com/triton-inference-server/client/blob/main/src/python/library/tritonclient/http/__init__.py)

In [None]:
import tritonclient.http as httpclient
from tritonclient.utils import InferenceServerException

try:
    triton_client = httpclient.InferenceServerClient(
        url='localhost:8000', verbose=True
    )
except Exception as e:
    print("context creation failed: " + str(e))
model_name = "resnet50"

inputs = []
inputs.append(
    httpclient.InferInput(name="input", shape=batch[0:4].shape, datatype="FP32")
)
inputs[0].set_data_from_numpy(batch[0:4].numpy(), binary_data=False)

outputs = []
outputs.append(httpclient.InferRequestedOutput(name="output"))

result = triton_client.infer(model_name=model_name, inputs=inputs, outputs=outputs)
triton_client.close()
triton_output = result.as_numpy('output')
triton_output = torch.nn.functional.softmax(
    torch.tensor(triton_output), dim=1) * 100
triton_output = triton_output.detach().numpy()
triton_output = triton_output.argmax(axis=1)
triton_class = np.array(classes)[triton_output]
triton_class

# stop triton server

In [16]:
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

try:
    triton_client = grpcclient.InferenceServerClient(
        url='localhost:8001', verbose=True
    )
except Exception as e:
    print("context creation failed: " + str(e))
model_name = "resnet50"

inputs = []
inputs.append(
    grpcclient.InferInput(name="input", shape=batch[0:4].shape, datatype="FP32")
)
inputs[0].set_data_from_numpy(batch[0:4].numpy())

outputs = []
outputs.append(grpcclient.InferRequestedOutput(name="output"))

result = triton_client.infer(model_name=model_name, inputs=inputs, outputs=outputs)
triton_client.close()
triton_output = result.as_numpy('output')
triton_output = torch.nn.functional.softmax(
    torch.tensor(triton_output), dim=1) * 100
triton_output = triton_output.detach().numpy()
triton_output = triton_output.argmax(axis=1)
triton_class = np.array(classes)[triton_output]
triton_class

# stop triton server

### Multi Model

In [25]:
def config_builder(name: str, platform: str, max_batch_size: int):
  config = (f"name: \"{name}\"\n"
            f"platform: \"{platform}\"\n"
            f"max_batch_size: {str(max_batch_size)}")
  common_config="""
  input [
    {
      name: "input"
      data_type: TYPE_FP32
      format: FORMAT_NCHW
      dims: [ 3, 224, 224 ]
    }
  ]
  output [
    {
      name: "output"
      data_type: TYPE_FP32
      dims: [ 1000 ]
    }
  ]
  """
  return config + common_config

print(config_builder('resnet50', 'onnxruntime_onnx', 100))

name: "resnet50"
platform: "onnxruntime_onnx"
max_batch_size: 100
  input [
    {
      name: "input"
      data_type: TYPE_FP32
      format: FORMAT_NCHW
      dims: [ 3, 224, 224 ]
    }
  ]
  output [
    {
      name: "output"
      data_type: TYPE_FP32
      dims: [ 1000 ]
    }
  ]
  


In [43]:
def generate_model_variants(model_name: str = 'resnet',
    model_variants: list = ['18', '34', '101']):
    # model name
    timm_models = timm.list_models(model_name+'*', pretrained=True)
    model_path = os.path.join(
        'models',
        model_name,
    )
    config_path = os.path.join(
        model_path,
        'config.pbtxt')
    # if 'models' not in os.listdir("./"):
    os.makedirs(model_path)
    config = config_builder(
        name=model_name,
        platform='onnxruntime_onnx',
        max_batch_size=100)
    with open(config_path, 'w') as f:
        f.write(config)
    for variant_id, model_variant in enumerate(model_variants):
        model_full_name = model_name + model_variant
        if not model_full_name in timm_models:
            raise ValueError(
                f"Model {model_full_name} does not exist"
            )
        model = timm.create_model(model_full_name, pretrained=True)
        model.eval()
        dummy_input = torch.randn(1, 3, 224, 224)
        model_variant_dir = os.path.join(model_path, str(variant_id+1))
        model_variant_path = os.path.join(model_variant_dir, 'model.onnx')
        # if 'models' not in os.listdir("./"):
        os.makedirs(model_variant_dir)
        torch.onnx.export(
            model, dummy_input,
            model_variant_path,
            input_names = ['input'],
            output_names = ['output'],
            dynamic_axes={'input' : {0 : 'batch_size'},
                          'output' : {0 : 'batch_size'}})

        

    # load them all on triton
config_builder('resnet50', 'onnxruntime_onnx', 100)
generate_model_variants(
    model_name='resnet',
    model_variants=['18', '34', '101'])
# TODO
# load and predict with the timm model
# model_name = 'resnet50'
# model = timm.create_model(model_name, pretrained=True)

# # check if the model vairants are available
# timm.list_models("resnet*")

# model.eval()
# torch_output = model(batch)
# torch_output = torch.nn.functional.softmax(torch_output, dim=1) * 100
# torch_output = torch_output.detach().numpy()
# torch_output = torch_output.argmax(axis=1)
# torch_class = np.array(classes)[torch_output]
# torch_class

# TODO
# save the onnx model
# import torch.onnx

# model_variant = 1
model_dir = os.path.join(
    'models',
    model_name,
    str(model_variant))
model_path = os.path.join(model_dir, 'model.onnx')
if 'models' not in os.listdir("./"):
    os.makedirs(model_dir)
# # Standard ImageNet input - 3 channels, 224x224,
# # values don't matter as we care about network structure.
# # But they can also be real inputs.
# dummy_input = torch.randn(1, 3, 224, 224)
# # Invoke export
# torch.onnx.export(
#     model, dummy_input,
#     model_path,
#     input_names = ['input'],
#     output_names = ['output'],
#     dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
#                   'output' : {0 : 'batch_size'}})



Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet34-43635321.pth" to /home/cc/.cache/torch/hub/checkpoints/resnet34-43635321.pth
Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet101_a1h-36d3f2aa.pth" to /home/cc/.cache/torch/hub/checkpoints/resnet101_a1h-36d3f2aa.pth


In [None]:
# generate models


In [None]:
# send request to multi-models

### Model Switching

### Model load and offload

In [None]:
# timm.list_models("resnet*")

### Seldon

In [4]:
# add model switching etc