### Dynamic batching

In [None]:
# Create the model directory together with its version sub-directory "1";
# `-p` creates missing parents and does not fail if the path already exists.
!mkdir -p ../models/onnx_dynamic_batching/1

In [None]:
# Server endpoint and the model/version to query.
url = '0.0.0.0:8000'
model_name = 'onnx_dynamic_batching'
model_version = '1'

# Tensor names and dtypes used when building requests
# (one dtype entry per input name).
input_name = ['INPUT0', 'INPUT1']
input_dtype = ['INT32'] * len(input_name)
output_name = ['OUTPUT0']

# Client verbosity flag.
VERBOSE = False

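1. `preferred_batch_size`: the batch sizes that the inference server should attempt to create.
2. `max_queue_delay_microseconds`: if a batch of a preferred size cannot be formed immediately, the server holds the queued requests and keeps batching, but only as long as no request waits longer than `max_queue_delay_microseconds`; after that the batch is sent even if it is smaller than a preferred size.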

In [None]:
# Model configuration text written verbatim to config.pbtxt below.
# It declares two INT32 inputs of 256 elements, one FP32 output of 28 elements,
# a maximum batch size of 32, and a dynamic batcher whose queue delay of
# 3000000 microseconds equals 3 seconds.
configuration = """
name: "onnx_dynamic_batching"
platform: "onnxruntime_onnx"
max_batch_size: 32
dynamic_batching { 
  preferred_batch_size: [ 4, 8, 16, 32 ] 
  max_queue_delay_microseconds: 3000000
}

input [
  {
    name: "INPUT0"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
input [
  {
    name: "INPUT1"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 28 ]
  }
]
"""

# The config file sits in the model directory itself, next to the
# version sub-directory, not inside it.
config_path = '../models/onnx_dynamic_batching/config.pbtxt'
with open(config_path, 'w') as config_file:
    config_file.write(configuration)

In [None]:
!cp -r ../models/onnx/1/ ../models/onnx_dynamic_batching/1

In [None]:
# Query the server's readiness endpoint; `-v` prints the request/response
# headers so the HTTP status code is visible.
!curl -v 0.0.0.0:8000/v2/health/ready

In [None]:
# Request the `onnx_dynamic_batching` model endpoint to check that the server
# knows about the model; `-v` also shows the HTTP status and headers.
!curl -v 0.0.0.0:8000/v2/models/onnx_dynamic_batching

In [None]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

# Set TOKENIZERS_PARALLELISM to 'False' for this process.
# NOTE(review): presumably meant to turn off the Hugging Face tokenizers'
# internal parallelism (and its fork warning) — confirm against the library docs.
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

In [None]:
# --- HTTP client settings -------------------------------------------------
VERBOSE = False                          # client verbosity flag
url = '0.0.0.0:8000'                     # server address used by the HTTP cells
model_name, model_version = 'onnx_dynamic_batching', '1'

# --- Request tensors ------------------------------------------------------
input_name = ['INPUT0', 'INPUT1']        # request input tensor names
input_dtype = ['INT32', 'INT32']         # dtype string for each input, same order
output_name = ['OUTPUT0']                # output tensor(s) to request

In [None]:
# Parse the JSON config stored with the weights and keep only its
# `id2label` entry (used later to turn a predicted index into a label).
with open('../weights/config.json', 'r') as config_file:
    model_config = json.load(config_file)

id2label = model_config['id2label']

In [None]:
# Sample sentence sent to the model in the inference cells below.
text = 'I feel lucky to be here.'

# Tokenizer loaded from the local weights directory.
tokenizer = AutoTokenizer.from_pretrained('../weights/')

In [None]:
%%time

with tritonhttpclient.InferenceServerClient(url=url, verbose=False, concurrency=32) as client:
    # Encode the data using tokenizer
    inputs = tokenizer(text, return_tensors="pt", max_length=256, padding='max_length')
    input_ids = np.array(inputs['input_ids'], dtype=np.int32)
    attention_mask = np.array(inputs['attention_mask'], dtype=np.int32)
    tick = time.time()
    
    # Define input config
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]
    
    # Attach input
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)
    
    # Define output config
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]
    
    # Hit triton server
    n_requests = 4
    responses = []
    
    for i in range(n_requests):
        responses.append(client.async_infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs))
    tock = time.time()
    print(f'Time taken: {tock - tick}')

In [None]:
result = responses[0].get_result().as_numpy(output_name[0])
id2label[str(result[0].argmax())]

#### With gRPC

In [None]:
import tritonclient.grpc as tritongrpcclient

In [None]:
# Same model and tensors as the HTTP section; only the endpoint changes.
model_name = 'onnx_dynamic_batching'
model_version = '1'
input_name = ['INPUT0', 'INPUT1']
input_dtype = ['INT32', 'INT32']
output_name = ['OUTPUT0']

# The gRPC cells talk to port 8001 (the HTTP cells above used 8000).
url = '0.0.0.0:8001'

VERBOSE = False

In [None]:
from functools import partial

In [None]:
%%time

client = tritongrpcclient.InferenceServerClient(url=url, verbose=False)
results = []

def callback(user_data, result, error):
    if error:
        user_data.append(error)
    else:
        user_data.append(result)

# Encode the data using tokenizer
inputs = tokenizer(text, return_tensors="pt", max_length=256, padding='max_length')
input_ids = np.array(inputs['input_ids'], dtype=np.int32)
attention_mask = np.array(inputs['attention_mask'], dtype=np.int32)
tick = time.time()

# Define input config
inputs = [
    tritongrpcclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
    tritongrpcclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
]

# Attach input
inputs[0].set_data_from_numpy(input_ids)
inputs[1].set_data_from_numpy(attention_mask)

# Define output config
outputs = [
    tritongrpcclient.InferRequestedOutput(output_name[0]),
]

# Hit triton server
n_requests = 4
responses = []

for i in range(n_requests):
    responses.append(client.async_infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs, callback=partial(callback, results)))
tock = time.time()
print(f'Time taken: {tock - tick}')

In [None]:
# Take the first entry the callback stored, read the requested output as a
# NumPy array, and map the arg-max index of its first row to a label.
first_result = results[0]
result = first_result.as_numpy(output_name[0])
predicted_id = str(result[0].argmax())
id2label[predicted_id]