### Dynamic batching

#### Python

In [1]:
# Create the version-1 directory for the `python_dynamic_batching` model under ../models.
# NOTE(review): no cell visible here places a model file in this directory —
# presumably that happens outside the notebook; confirm.
!mkdir -p ../models/python_dynamic_batching/1

In [2]:
# Client-side settings for the Python-backend model (HTTP endpoint).
url = '0.0.0.0:8000'
model_name, model_version = 'python_dynamic_batching', '1'

# Tensor names and wire dtypes; they mirror the input/output entries of the
# config.pbtxt written in the next cell.
input_name, input_dtype = ['input_ids', 'attention_mask'], ['INT32', 'INT32']
output_name = ['OUTPUT0']

VERBOSE = False

In [3]:
# Model configuration for the Python backend with dynamic batching switched on:
# up to 32 requests per batch, preferred sizes 4/8/16/32, and a queue delay of
# 3,000,000 microseconds (3 s).
configuration = """
name: "python_dynamic_batching"
backend: "python"
max_batch_size: 32

dynamic_batching { 
  preferred_batch_size: [ 4, 8, 16, 32 ] 
  max_queue_delay_microseconds: 3000000
}

input [
  {
    name: "input_ids"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
input [
  {
    name: "attention_mask"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 28 ]
  }
]
"""

# Persist the configuration at the root of the model's folder (next to the
# version directory `1`).
config_path = '../models/python_dynamic_batching/config.pbtxt'
with open(config_path, 'w') as config_file:
    config_file.write(configuration)

In [4]:
!curl -v 0.0.0.0:8000/v2/health/ready

*   Trying 0.0.0.0:8000...
* Connected to 0.0.0.0 (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1
> Host: 0.0.0.0:8000
> User-Agent: curl/7.86.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Length: 0
< Content-Type: text/plain
< 
* Connection #0 to host 0.0.0.0 left intact


In [5]:
!curl -v 0.0.0.0:8000/v2/models/python_dynamic_batching

*   Trying 0.0.0.0:8000...
* Connected to 0.0.0.0 (127.0.0.1) port 8000 (#0)
> GET /v2/models/python_dynamic_batching HTTP/1.1
> Host: 0.0.0.0:8000
> User-Agent: curl/7.86.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Type: application/json
< Content-Length: 266
< 
* Connection #0 to host 0.0.0.0 left intact
{"name":"python_dynamic_batching","versions":["1"],"platform":"python","inputs":[{"name":"input_ids","datatype":"INT32","shape":[-1,256]},{"name":"attention_mask","datatype":"INT32","shape":[-1,256]}],"outputs":[{"name":"OUTPUT0","datatype":"FP32","shape":[-1,28]}]}

In [6]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

# Set TOKENIZERS_PARALLELISM for this process before any tokenizer is created.
# Presumably this turns off the tokenizers library's internal parallelism and
# its fork warning — confirm against the library docs.
os.environ.update({'TOKENIZERS_PARALLELISM': 'False'})

  from .autonotebook import tqdm as notebook_tqdm
2023-02-28 23:22:11.293166: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Same client settings as the earlier config cell, restated so this part of the
# notebook can be run on its own.
VERBOSE = False
url = '0.0.0.0:8000'
model_name = 'python_dynamic_batching'
model_version = '1'
input_name = ['input_ids', 'attention_mask']
input_dtype = ['INT32'] * len(input_name)  # both inputs are INT32
output_name = ['OUTPUT0']

In [8]:
# Read the model's config.json and keep only the class-index -> label mapping.
with open('../weights/config.json', 'r') as config_file:
    model_config = json.load(config_file)
id2label = model_config['id2label']

In [9]:
# Sample sentence to classify, and the tokenizer loaded from the local weights folder.
text = 'I feel lucky to be here.'
tokenizer = AutoTokenizer.from_pretrained('../weights/')

In [10]:
%%time

client = tritonhttpclient.InferenceServerClient(url=url, verbose=False, concurrency=32)
# Encode the data using tokenizer
inputs = tokenizer(text, return_tensors="pt", max_length=256, padding='max_length')
input_ids = np.array(inputs['input_ids'], dtype=np.int32)
attention_mask = np.array(inputs['attention_mask'], dtype=np.int32)
tick = time.time()

# Define input config
inputs = [
    tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
    tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
]

# Attach input
inputs[0].set_data_from_numpy(input_ids)
inputs[1].set_data_from_numpy(attention_mask)

# Define output config
outputs = [
    tritonhttpclient.InferRequestedOutput(output_name[0]),
]

# Hit triton server
n_requests = 4
responses = []

for i in range(n_requests):
    responses.append(client.async_infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs))
tock = time.time()
print(f'Time taken: {tock - tick}')

Time taken: 0.047541141510009766
CPU times: user 745 ms, sys: 80.8 ms, total: 825 ms
Wall time: 866 ms


In [11]:
# Decode the first response: the arg-max over its output row selects the label.
first_response = responses[0].get_result()
result = first_response.as_numpy(output_name[0])
predicted_id = str(result[0].argmax())
id2label[predicted_id]

'relief'

#### ONNX

In [12]:
# Create the version-1 directory for the `onnx_dynamic_batching` model under ../models.
# A later cell copies the model files from ../models/onnx/1 into it.
!mkdir -p ../models/onnx_dynamic_batching/1

In [13]:
# Client-side settings for the ONNX model (HTTP endpoint).
url = '0.0.0.0:8000'
model_name, model_version = 'onnx_dynamic_batching', '1'

# Tensor names and wire dtypes; they mirror the input/output entries of the
# config.pbtxt written below.
input_name, input_dtype = ['input_ids', 'attention_mask'], ['INT32', 'INT32']
output_name = ['OUTPUT0']

VERBOSE = False

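1. `preferred_batch_size`: the batch sizes that the inference server's dynamic batcher should try to form from queued requests.
2. `max_queue_delay_microseconds`: the longest a request may wait in the queue while the batcher tries to form a batch of a preferred size. If no preferred-size batch can be formed, the server holds the batch back only as long as no request has waited longer than `max_queue_delay_microseconds`, then runs whatever has been collected.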

In [14]:
# Model configuration for the ONNX Runtime platform with dynamic batching
# switched on: up to 32 requests per batch, preferred sizes 4/8/16/32, and a
# queue delay of 3,000,000 microseconds (3 s).
configuration = """
name: "onnx_dynamic_batching"
platform: "onnxruntime_onnx"
max_batch_size: 32
dynamic_batching { 
  preferred_batch_size: [ 4, 8, 16, 32 ] 
  max_queue_delay_microseconds: 3000000
}

input [
  {
    name: "input_ids"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
input [
  {
    name: "attention_mask"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 28 ]
  }
]
"""

# Persist the configuration at the root of the model's folder (next to the
# version directory `1`).
config_path = '../models/onnx_dynamic_batching/config.pbtxt'
with open(config_path, 'w') as config_file:
    config_file.write(configuration)

In [15]:
!cp -r ../models/onnx/1/ ../models/onnx_dynamic_batching/1

In [16]:
!curl -v 0.0.0.0:8000/v2/health/ready

*   Trying 0.0.0.0:8000...
* Connected to 0.0.0.0 (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1
> Host: 0.0.0.0:8000
> User-Agent: curl/7.86.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Length: 0
< Content-Type: text/plain
< 
* Connection #0 to host 0.0.0.0 left intact


In [17]:
!curl -v 0.0.0.0:8000/v2/models/onnx_dynamic_batching

*   Trying 0.0.0.0:8000...
* Connected to 0.0.0.0 (127.0.0.1) port 8000 (#0)
> GET /v2/models/onnx_dynamic_batching HTTP/1.1
> Host: 0.0.0.0:8000
> User-Agent: curl/7.86.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Type: application/json
< Content-Length: 274
< 
* Connection #0 to host 0.0.0.0 left intact
{"name":"onnx_dynamic_batching","versions":["1"],"platform":"onnxruntime_onnx","inputs":[{"name":"input_ids","datatype":"INT32","shape":[-1,256]},{"name":"attention_mask","datatype":"INT32","shape":[-1,256]}],"outputs":[{"name":"OUTPUT0","datatype":"FP32","shape":[-1,28]}]}

In [18]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

# Set TOKENIZERS_PARALLELISM for this process (repeats the earlier setup cell).
# Presumably this turns off the tokenizers library's internal parallelism —
# confirm against the library docs.
os.environ.update({'TOKENIZERS_PARALLELISM': 'False'})

In [19]:
# Same ONNX client settings as the config cell at the start of this section,
# restated so the client cells can be run on their own.
VERBOSE = False
url = '0.0.0.0:8000'
model_name = 'onnx_dynamic_batching'
model_version = '1'
input_name = ['input_ids', 'attention_mask']
input_dtype = ['INT32'] * len(input_name)  # both inputs are INT32
output_name = ['OUTPUT0']

In [20]:
# Read the model's config.json and keep only the class-index -> label mapping.
with open('../weights/config.json', 'r') as config_file:
    model_config = json.load(config_file)
id2label = model_config['id2label']

In [21]:
# Sample sentence to classify, and the tokenizer loaded from the local weights folder.
text = 'I feel lucky to be here.'
tokenizer = AutoTokenizer.from_pretrained('../weights/')

In [22]:
%%time

with tritonhttpclient.InferenceServerClient(url=url, verbose=False, concurrency=32) as client:
    # Encode the data using tokenizer
    inputs = tokenizer(text, return_tensors="pt", max_length=256, padding='max_length')
    input_ids = np.array(inputs['input_ids'], dtype=np.int32)
    attention_mask = np.array(inputs['attention_mask'], dtype=np.int32)
    tick = time.time()
    
    # Define input config
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]
    
    # Attach input
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)
    
    # Define output config
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]
    
    # Hit triton server
    n_requests = 4
    responses = []
    
    for i in range(n_requests):
        responses.append(client.async_infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs))
    tock = time.time()
    print(f'Time taken: {tock - tick}')

Time taken: 0.04227113723754883
CPU times: user 5.95 ms, sys: 2.86 ms, total: 8.81 ms
Wall time: 488 ms


In [23]:
# Decode the first response: the arg-max over its output row selects the label.
first_response = responses[0].get_result()
result = first_response.as_numpy(output_name[0])
predicted_id = str(result[0].argmax())
id2label[predicted_id]

'relief'

#### With gRPC

In [24]:
import tritonclient.grpc as tritongrpcclient

In [25]:
# Client-side settings for the gRPC run: same ONNX model as before, but the
# endpoint uses port 8001 instead of the HTTP port 8000.
url = '0.0.0.0:8001'
model_name, model_version = 'onnx_dynamic_batching', '1'

# Tensor names and wire dtypes are unchanged from the HTTP run.
input_name, input_dtype = ['input_ids', 'attention_mask'], ['INT32', 'INT32']
output_name = ['OUTPUT0']

VERBOSE = False

In [26]:
from functools import partial

In [27]:
%%time

client = tritongrpcclient.InferenceServerClient(url=url, verbose=False)
results = []

def callback(user_data, result, error):
    if error:
        user_data.append(error)
    else:
        user_data.append(result)

# Encode the data using tokenizer
inputs = tokenizer(text, return_tensors="pt", max_length=256, padding='max_length')
input_ids = np.array(inputs['input_ids'], dtype=np.int32)
attention_mask = np.array(inputs['attention_mask'], dtype=np.int32)
tick = time.time()

# Define input config
inputs = [
    tritongrpcclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
    tritongrpcclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
]

# Attach input
inputs[0].set_data_from_numpy(input_ids)
inputs[1].set_data_from_numpy(attention_mask)

# Define output config
outputs = [
    tritongrpcclient.InferRequestedOutput(output_name[0]),
]

# Hit triton server
n_requests = 4
responses = []

for i in range(n_requests):
    responses.append(client.async_infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs, callback=partial(callback, results)))
tock = time.time()
print(f'Time taken: {tock - tick}')

Time taken: 0.0011510848999023438
CPU times: user 4.33 ms, sys: 2.1 ms, total: 6.43 ms
Wall time: 5.53 ms


In [28]:
# Decode the first gRPC result: the arg-max over its output row selects the label.
first_result = results[0]
result = first_result.as_numpy(output_name[0])
predicted_id = str(result[0].argmax())
id2label[predicted_id]

'relief'