# Overall

How Triton Inference Server is configured:

1. Choose the TensorFlow model.
2. Choose the backend or platform you want to deploy your model to.
3. Set the config and model checkpoints for the compiled model. The config contains information about the backend/platform, the inputs, and the outputs.
4. Check whether Triton has loaded the model.
5. If it has loaded, wrap the inputs in the tritonclient input wrapper and hit the API.

### Baseline

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import time

import os

# Disable parallelism in the Hugging Face tokenizers library via its env switch.
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

# Reference run: load the tokenizer and the TF classifier for this checkpoint.
model_name = 'joeddav/distilbert-base-uncased-go-emotions-student'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    return_dict=False,  # the call below is unpacked as a plain one-element tuple
)

# Pad the sentence to a fixed length of 256 tokens and return TF tensors.
inputs = tokenizer(
    "I feel lucky to be here.",
    return_tensors="tf",
    max_length=256,
    padding='max_length',
)

# Time a single forward pass.
tick = time.time()
(logits,) = model(**inputs)
tock = time.time()
print(f'Time taken: {tock - tick}')

# Translate the highest-scoring class index into its label; the bare name on
# the last line makes the notebook display it.
top_class = tf.argmax(logits[0]).numpy()
predicted_label = model.config.id2label[top_class]
predicted_label

In [None]:
# Persist the tokenizer and the model side by side so that later cells can
# reload everything from one local directory.
for artifact in (tokenizer, model):
    artifact.save_pretrained('../weights_tf')

### ONNX backend

In [None]:
# Restart the kernel before the ONNX export (presumably to release the memory
# held by the baseline model -- TODO confirm).
import IPython

running_app = IPython.Application.instance()
running_app.kernel.do_shutdown(True)  # True asks for a restart, not just a stop

In [None]:
# Create the model directory for `onnx_tf` together with its version
# sub-directory `1`; the exported model.onnx is written there later.
!mkdir -p ../models/onnx_tf/1

In [None]:
# Names given to the two exported graph inputs; the export cell binds
# INPUT0 to input_ids and INPUT1 to attention_mask when running the session.
input_name = ['INPUT0', 'INPUT1']

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tf2onnx
import time
import json
from onnxruntime import InferenceSession
import os
import numpy as np

# Disable parallelism in the Hugging Face tokenizers library via its env switch.
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

# Local checkpoint directory written earlier in this notebook by
# `save_pretrained`; tokenizer, weights and config.json are all read from it.
model_name = '../weights_tf'
onnx_path = '../models/onnx_tf/1/model.onnx'
sample_text = "I feel lucky to be here."

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, return_dict=False)

# Describe the inputs: two int32 tensors with dynamic batch and sequence
# dimensions, named after the entries of `input_name`.
input_spec = (
    tf.TensorSpec((None, None), tf.int32, name=input_name[0]),
    tf.TensorSpec((None, None), tf.int32, name=input_name[1]),
)

# Convert the Keras model to ONNX and write it into the model repository.
model_proto, _ = tf2onnx.convert.from_keras(
    model,
    input_signature=input_spec,
    opset=13,
    output_path=onnx_path,
)
output_name = [n.name for n in model_proto.graph.output]

# Read the label map from the same checkpoint directory the model came from
# (previously '../weights', which this notebook never writes).
with open(os.path.join(model_name, 'config.json'), 'r') as f:
    id2label = json.load(f)['id2label']

# Sanity-check the exported graph locally with onnxruntime before serving it.
inputs = tokenizer(sample_text, return_tensors="np", max_length=256, padding='max_length')
session = InferenceSession(onnx_path)

tick = time.time()
logits = session.run(
    output_names=output_name,
    input_feed={
        input_name[0]: inputs['input_ids'].astype(np.int32),
        input_name[1]: inputs['attention_mask'].astype(np.int32),
    },
)
tock = time.time()
print(f'Time taken: {tock - tick}')

# `session.run` returns a list of outputs; take the first output, first row.
id2label[str(logits[0][0].argmax())]

In [None]:
# Collect and display the names of the ONNX graph outputs.
output_names = [graph_output.name for graph_output in model_proto.graph.output]
output_names

#### Send request to server

In [None]:
# Restart the kernel again so the client cells below start from a clean state.
import IPython

running_app = IPython.Application.instance()
running_app.kernel.do_shutdown(True)  # True asks for a restart, not just a stop

In [None]:
# Query the server's readiness endpoint on port 8000 (verbose output shows
# the HTTP status line).
!curl -v 0.0.0.0:8000/v2/health/ready

In [None]:
# Ask the server about the `onnx_tf` model.
# NOTE(review): config.pbtxt for this model is only written a few cells below;
# presumably this check is meant to be re-run afterwards -- confirm.
!curl -v 0.0.0.0:8000/v2/models/onnx_tf

In [None]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

# Disable parallelism in the Hugging Face tokenizers library via its env switch.
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

In [None]:
# Triton model configuration for the ONNX export.
# The `name` field must match the model directory (`onnx_tf`), which is also
# the name the client cells request; it was previously "onnx".
configuration = """
name: "onnx_tf"
platform: "onnxruntime_onnx"
max_batch_size: 32

input [
  {
    name: "INPUT0"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
input [
  {
    name: "INPUT1"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
output [
  {
    name: "output_1"
    data_type: TYPE_FP32
    dims: [ 28 ]
  }
]
"""


# config.pbtxt sits next to the version directory `1` inside the model folder.
with open('../models/onnx_tf/config.pbtxt', 'w') as f:
    f.write(configuration)

In [None]:
# Where the Triton HTTP endpoint lives and which model/version to call.
url = '0.0.0.0:8000'
model_name = 'onnx_tf'
model_version = '1'

# Tensor names and dtypes; these have to agree with config.pbtxt.
input_name = ['INPUT0', 'INPUT1']
input_dtype = ['INT32'] * len(input_name)
output_name = ['output_1']

# Client logging switch.
VERBOSE = False

In [None]:
# Load the class-index -> label map from the checkpoint directory this
# notebook saved ('../weights_tf'); keys are strings because they come
# from JSON.
with open('../weights_tf/config.json', 'r') as f:
    id2label = json.load(f)['id2label']

In [None]:
# Reload the tokenizer from the directory this notebook saved it to
# ('../weights_tf') and define the sentence to classify.
tokenizer = AutoTokenizer.from_pretrained('../weights_tf')
text = 'I feel lucky to be here.'

In [None]:
# Send one tokenized sentence to Triton over HTTP and time the request.
with tritonhttpclient.InferenceServerClient(url=url, verbose=VERBOSE) as client:
    # Encode the data using the tokenizer. NumPy tensors avoid a PyTorch
    # dependency; truncation keeps the sequence at the 256 tokens that the
    # model configuration declares.
    encoded = tokenizer(
        text,
        return_tensors="np",
        max_length=256,
        padding='max_length',
        truncation=True,
    )
    input_ids = encoded['input_ids'].astype(np.int32)
    attention_mask = encoded['attention_mask'].astype(np.int32)
    tick = time.time()

    # Define input config
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]

    # Attach input
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)

    # Define output config
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]

    # Hit triton server
    response = client.infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs)
    logits = response.as_numpy(output_name[0])
    tock = time.time()
    print(f'Time taken: {tock - tick}')

In [None]:
# Pick the highest-scoring class of the first row and look up its label
# (the map is keyed by strings).
best_class = logits[0].argmax()
id2label[str(best_class)]

### TensorRT backend (GPU only)

#### Installation Guide

Check the README.

#### Create models

Let's create two plans: one for FP32 and another for FP16 (faster and uses less memory).

#### FP32

In [None]:
# Create the model directory for `tensorrt_fp32_tf` with version sub-directory `1`.
!mkdir -p ../models/tensorrt_fp32_tf/1

In [None]:
# Build a TensorRT engine from the exported ONNX model and save it as model.plan.
# The min/opt/max shape flags let the batch dimension range from 1 to 32
# (optimised for 16) with the sequence length fixed at 256.
# NOTE(review): --shapes presumably sets the shape trtexec uses for its own
# run after building -- confirm.
!trtexec --onnx=../models/onnx_tf/1/model.onnx --optShapes=INPUT0:16x256,INPUT1:16x256 --maxShapes=INPUT0:32x256,INPUT1:32x256 --minShapes=INPUT0:1x256,INPUT1:1x256 --shapes=INPUT0:1x256,INPUT1:1x256 --saveEngine=../models/tensorrt_fp32_tf/1/model.plan

In [None]:
# Query the server's readiness endpoint on port 8000.
!curl -v 0.0.0.0:8000/v2/health/ready

In [None]:
# Ask the server about the `tensorrt_fp32_tf` model.
!curl -v 0.0.0.0:8000/v2/models/tensorrt_fp32_tf

In [None]:
# Triton model configuration for the FP32 TensorRT plan.
# NOTE(review): the ONNX config in this notebook names the output "output_1";
# an engine built from that ONNX file presumably keeps that tensor name --
# confirm that "OUTPUT0" matches the plan's output binding.
configuration = """
name: "tensorrt_fp32_tf"
platform: "tensorrt_plan"
max_batch_size: 32

input [
 {
    name: "INPUT0"
    data_type: TYPE_INT32
    dims: [ 256 ]
  } ,
{
    name: "INPUT1"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
output {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 28 ]
}
"""

# Write into the directory that actually holds the plan (`tensorrt_fp32_tf`);
# the previous target '../models/tensorrt_fp32/' is not the folder created
# for this model.
with open('../models/tensorrt_fp32_tf/config.pbtxt', 'w') as f:
    f.write(configuration)

In [None]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

# Disable parallelism in the Hugging Face tokenizers library via its env switch.
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

In [None]:
# Where the Triton HTTP endpoint lives and which model/version to call.
url = '0.0.0.0:8000'
model_name = 'tensorrt_fp32_tf'
model_version = '1'

# Tensor names and dtypes; these have to agree with config.pbtxt.
input_name = ['INPUT0', 'INPUT1']
input_dtype = ['INT32'] * len(input_name)
output_name = ['OUTPUT0']

# Client logging switch.
VERBOSE = False

In [None]:
# Reload the tokenizer from the directory this notebook saved it to
# ('../weights_tf') and define the sentence to classify.
tokenizer = AutoTokenizer.from_pretrained('../weights_tf')
text = 'I feel lucky to be here.'

In [None]:
# Send one tokenized sentence to the FP32 TensorRT model and time the request.
with tritonhttpclient.InferenceServerClient(url=url, verbose=VERBOSE) as client:
    # Encode the data using the tokenizer. NumPy tensors avoid a PyTorch
    # dependency; truncation keeps the sequence at the 256 tokens that the
    # model configuration declares.
    encoded = tokenizer(
        text,
        return_tensors="np",
        max_length=256,
        padding='max_length',
        truncation=True,
    )
    input_ids = encoded['input_ids'].astype(np.int32)
    attention_mask = encoded['attention_mask'].astype(np.int32)
    tick = time.time()

    # Define input config
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]

    # Attach input
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)

    # Define output config
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]

    # Hit triton server
    response = client.infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs)
    logits = response.as_numpy(output_name[0])
    tock = time.time()
    print(f'Time taken: {tock - tick}')

In [None]:
# Take the argmax over the first row only, so the result is a class index
# rather than a position in the flattened batch.
id2label[str(logits[0].argmax())]

#### FP16

In [None]:
# Create the model directory for `tensorrt_fp16_tf` with version sub-directory `1`.
!mkdir -p ../models/tensorrt_fp16_tf/1


In [None]:
# Same engine build as the FP32 plan (batch 1..32, optimised for 16, sequence
# length 256), but with the --fp16 flag added and a separate output path.
!trtexec --onnx=../models/onnx_tf/1/model.onnx --optShapes=INPUT0:16x256,INPUT1:16x256 --maxShapes=INPUT0:32x256,INPUT1:32x256 --minShapes=INPUT0:1x256,INPUT1:1x256 --shapes=INPUT0:1x256,INPUT1:1x256 --saveEngine=../models/tensorrt_fp16_tf/1/model.plan --fp16

In [None]:
!curl -v 0.0.0.0:8000/v2/models/tensorrt_fp16

In [None]:
# Triton model configuration for the FP16 TensorRT plan.
# NOTE(review): the ONNX config in this notebook names the output "output_1";
# an engine built from that ONNX file presumably keeps that tensor name --
# confirm that "OUTPUT0" matches the plan's output binding.
configuration = """
name: "tensorrt_fp16_tf"
platform: "tensorrt_plan"
max_batch_size: 32

input [
 {
    name: "INPUT0"
    data_type: TYPE_INT32
    dims: [ 256 ]
  } ,
{
    name: "INPUT1"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
output {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 28 ]
}
"""


# Write into the directory that actually holds the plan (`tensorrt_fp16_tf`);
# the previous target '../models/tensorrt_fp16/' is not the folder created
# for this model.
with open('../models/tensorrt_fp16_tf/config.pbtxt', 'w') as f:
    f.write(configuration)

In [None]:
# Where the Triton HTTP endpoint lives and which model/version to call.
url = '0.0.0.0:8000'
model_name = 'tensorrt_fp16_tf'
model_version = '1'

# Tensor names and dtypes; these have to agree with config.pbtxt.
input_name = ['INPUT0', 'INPUT1']
input_dtype = ['INT32'] * len(input_name)
output_name = ['OUTPUT0']

# Client logging switch.
VERBOSE = False

In [None]:
# Reload the tokenizer from the directory this notebook saved it to
# ('../weights_tf') and define the sentence to classify.
tokenizer = AutoTokenizer.from_pretrained('../weights_tf')
text = 'I feel lucky to be here.'

In [None]:
# Send one tokenized sentence to the FP16 TensorRT model and time the request.
with tritonhttpclient.InferenceServerClient(url=url, verbose=VERBOSE) as client:
    # Encode the data using the tokenizer. NumPy tensors avoid a PyTorch
    # dependency; truncation keeps the sequence at the 256 tokens that the
    # model configuration declares.
    encoded = tokenizer(
        text,
        return_tensors="np",
        max_length=256,
        padding='max_length',
        truncation=True,
    )
    input_ids = encoded['input_ids'].astype(np.int32)
    attention_mask = encoded['attention_mask'].astype(np.int32)
    tick = time.time()

    # Define input config
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]

    # Attach input
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)

    # Define output config
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]

    # Hit triton server
    response = client.infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs)
    logits = response.as_numpy(output_name[0])
    tock = time.time()
    print(f'Time taken: {tock - tick}')

In [None]:
# Take the argmax over the first row only, so the result is a class index
# rather than a position in the flattened batch.
id2label[str(logits[0].argmax())]