# Overall

How the Triton Inference Server is configured:

1. Choose the PyTorch model.
2. Choose the backend or platform you want to deploy your model to.
3. Set config and model checkpoints for the compiled model. The config will contain info about the backend/platform, input and output.
4. Check whether Triton has loaded the model.
5. If loaded, define the input in tritonclient input wrapper and hit the API.

### Python backend

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time

import os

os.environ['TOKENIZERS_PARALLELISM'] = 'False'

# Load the pretrained emotion classifier and its tokenizer.
model_name = 'joeddav/distilbert-base-uncased-go-emotions-student'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() returns the module itself, so it can be chained onto the load call.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    return_dict=False,
).eval()

# Tokenize one sample sentence, padded out to 256 tokens.
inputs = tokenizer(
    "I feel lucky to be here.",
    return_tensors="pt",
    max_length=256,
    padding='max_length',
)

# Time a single forward pass.
tick = time.time()
with torch.inference_mode():
    logits = model(**inputs)
tock = time.time()
print(f'Time taken: {tock - tick}')

# The first element of the returned tuple is used as the logits;
# map the highest-scoring index to its label name.
best_index = logits[0].argmax().item()
predicted_label = model.config.id2label[best_index]
predicted_label

In [None]:
# Persist tokenizer and model side by side; later cells reload both from '../weights'.
for artifact in (tokenizer, model):
    artifact.save_pretrained('../weights')

#### Setup configuration

1. https://github.com/triton-inference-server/python_backend/blob/main/examples/pytorch/model.py
2. https://github.com/triton-inference-server/python_backend/blob/main/examples/pytorch/config.pbtxt

In [None]:
!mkdir -p ../models/pytorch/1
!touch ../models/pytorch/1/model.py

## Now create those files (`model.py` and `config.pbtxt`)

In [None]:
# Triton model configuration for the "pytorch" model (Python backend).
# Declares two INT32 inputs of 256 values each, one FP32 output of 28 values,
# a maximum batch size of 32 and a single CPU instance.
configuration = """
name: "pytorch"
backend: "python"
max_batch_size: 32


input [
 {
    name: "INPUT0"
    data_type: TYPE_INT32
    dims: [ 256 ]
  } ,
{
    name: "INPUT1"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
output {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 28 ]
  }

instance_group [
 {
    count: 1
    kind: KIND_CPU
 }
]
"""


# Write the config into the model directory, next to the "1" version folder.
with open('../models/pytorch/config.pbtxt', 'w') as f:
    f.write(configuration)

#### Send request to server

We will be using tritonclient to hit the API. Check [installation instructions](https://github.com/triton-inference-server/client#download-using-python-package-installer-pip).

In [None]:
# Restart the notebook kernel (True asks for a restart rather than a plain shutdown).
import IPython

kernel = IPython.Application.instance().kernel
kernel.do_shutdown(True)

In [None]:
!curl -v 0.0.0.0:8000/v2/health/ready

In [None]:
!curl -v 0.0.0.0:8000/v2/models/pytorch

In [None]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

os.environ['TOKENIZERS_PARALLELISM'] = 'False'

In [None]:
# Read the label map (class index -> label name) from the saved model config.
with open('../weights/config.json', 'r') as config_file:
    saved_config = json.load(config_file)
id2label = saved_config['id2label']

In [None]:
# Request settings for the "pytorch" model: tensor names and dtypes mirror config.pbtxt.
VERBOSE = False
model_name, model_version = 'pytorch', '1'
url = '0.0.0.0:8000'
input_name = [f'INPUT{index}' for index in range(2)]
input_dtype = ['INT32'] * len(input_name)
output_name = ['OUTPUT0']

In [None]:
# Sample sentence to classify, and the tokenizer saved alongside the weights.
text = 'I feel lucky to be here.'
tokenizer = AutoTokenizer.from_pretrained('../weights/')

In [None]:
# Send one inference request to the Triton HTTP endpoint and time the round trip.
with tritonhttpclient.InferenceServerClient(url=url, verbose=VERBOSE) as client:
    # Tokenize straight to NumPy so this client does not depend on torch.
    # truncation=True keeps longer texts at the 256 tokens the model config expects.
    encoded = tokenizer(text, return_tensors="np", max_length=256,
                        padding='max_length', truncation=True)
    # The request declares INT32 tensors (see input_dtype), so cast explicitly.
    input_ids = encoded['input_ids'].astype(np.int32)
    attention_mask = encoded['attention_mask'].astype(np.int32)
    # perf_counter is a monotonic clock meant for measuring intervals;
    # tokenization above is deliberately excluded from the timing.
    tick = time.perf_counter()

    # Define input config
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]

    # Attach input
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)

    # Define output config
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]

    # Hit triton server
    response = client.infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs)
    logits = response.as_numpy(output_name[0])
    tock = time.perf_counter()
    print(f'Time taken: {tock - tick}')

In [None]:
# Take the arg-max over the first (only) row of the batch: a flat argmax over
# the whole array would give an out-of-range key for any batch larger than one.
id2label[str(logits[0].argmax())]

### Onnx backend

In [None]:
# Restart the notebook kernel (True asks for a restart rather than a plain shutdown).
import IPython

kernel = IPython.Application.instance().kernel
kernel.do_shutdown(True)

In [None]:
!mkdir -p ../models/onnx/1

In [None]:
# Settings for the "onnx" model: the tensor names are reused as ONNX graph input/output names.
VERBOSE = False
model_name, model_version = 'onnx', '1'
url = '0.0.0.0:8000'
input_name = [f'INPUT{index}' for index in range(2)]
input_dtype = ['INT32'] * len(input_name)
output_name = ['OUTPUT0']

In [None]:
# Triton model configuration for the "onnx" model (onnxruntime_onnx platform).
# Declares two INT32 inputs of 256 values each (written as two separate
# `input` entries), one FP32 output of 28 values and a maximum batch size of 32.
configuration = """
name: "onnx"
platform: "onnxruntime_onnx"
max_batch_size: 32

input [
  {
    name: "INPUT0"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
input [
  {
    name: "INPUT1"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 28 ]
  }
]
"""


# Write the config into the model directory, next to the "1" version folder.
with open('../models/onnx/config.pbtxt', 'w') as f:
    f.write(configuration)

In [None]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time
import json
from onnxruntime import InferenceSession
import os
import numpy as np

os.environ['TOKENIZERS_PARALLELISM'] = 'False'


class OnnxModel(nn.Module):
    """Wrapper that exposes only the logits of a sequence-classification model.

    The forward pass takes two positional tensors and returns a single
    tensor, which is the shape of interface the exporter is given below.
    """

    def __init__(self, model_name):
        """Load the classifier identified by ``model_name`` with tuple outputs."""
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            return_dict=False,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the first element of its output tuple."""
        outputs = self.model(input_ids, attention_mask)
        logits = outputs[0]
        return logits


# Reload the locally saved tokenizer and weights.
model_name = '../weights'
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer("I feel lucky to be here.", return_tensors="pt", max_length=256, padding='max_length')

model = OnnxModel(model_name)
model.eval()

# Export with INT32 example tensors so the graph inputs match the
# TYPE_INT32 declared in config.pbtxt.
example_inputs = (
    inputs['input_ids'].type(torch.int32),
    inputs['attention_mask'].type(torch.int32),
)
# Mark axis 0 of every input and output as a dynamic batch dimension.
dynamic_axes = {name: {0: 'batch_size'} for name in (*input_name, *output_name)}

with torch.inference_mode():
    torch.onnx.export(model, example_inputs,
                      '../models/onnx/1/model.onnx', verbose=False,
                      input_names=input_name, output_names=output_name,
                      dynamic_axes=dynamic_axes)


with open('../weights/config.json', 'r') as f:
    id2label = json.load(f)['id2label']

# Sanity-check the exported graph locally with ONNX Runtime.
inputs = tokenizer("I feel lucky to be here.", return_tensors="np", max_length=256, padding='max_length')
# ONNX Runtime >= 1.9 raises unless `providers` is given on builds that ship
# more than the CPU provider; the CPU provider is always available.
session = InferenceSession("../models/onnx/1/model.onnx", providers=['CPUExecutionProvider'])

# No torch context manager here: ONNX Runtime does not go through autograd.
tick = time.perf_counter()
logits = session.run(
    output_names=output_name,
    input_feed={
        input_name[0]: inputs['input_ids'].astype(np.int32),
        input_name[1]: inputs['attention_mask'].astype(np.int32),
    },
)
tock = time.perf_counter()
print(f'Time taken: {tock - tick}')

# session.run returns a list of outputs; take the first output, first batch row.
id2label[str(logits[0][0].argmax().item())]

#### Send request to server

In [None]:
# Restart the notebook kernel (True asks for a restart rather than a plain shutdown).
import IPython

kernel = IPython.Application.instance().kernel
kernel.do_shutdown(True)

In [None]:
!curl -v 0.0.0.0:8000/v2/health/ready

In [None]:
!curl -v 0.0.0.0:8000/v2/models/onnx

In [None]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

os.environ['TOKENIZERS_PARALLELISM'] = 'False'

In [None]:
# Request settings for the "onnx" model: tensor names and dtypes mirror config.pbtxt.
VERBOSE = False
model_name, model_version = 'onnx', '1'
url = '0.0.0.0:8000'
input_name = [f'INPUT{index}' for index in range(2)]
input_dtype = ['INT32'] * len(input_name)
output_name = ['OUTPUT0']

In [None]:
# Read the label map (class index -> label name) from the saved model config.
with open('../weights/config.json', 'r') as config_file:
    saved_config = json.load(config_file)
id2label = saved_config['id2label']

In [None]:
# Sample sentence to classify, and the tokenizer saved alongside the weights.
text = 'I feel lucky to be here.'
tokenizer = AutoTokenizer.from_pretrained('../weights/')

In [None]:
# Send one inference request to the Triton HTTP endpoint and time the round trip.
with tritonhttpclient.InferenceServerClient(url=url, verbose=VERBOSE) as client:
    # Tokenize straight to NumPy so this client does not depend on torch.
    # truncation=True keeps longer texts at the 256 tokens the model config expects.
    encoded = tokenizer(text, return_tensors="np", max_length=256,
                        padding='max_length', truncation=True)
    # The request declares INT32 tensors (see input_dtype), so cast explicitly.
    input_ids = encoded['input_ids'].astype(np.int32)
    attention_mask = encoded['attention_mask'].astype(np.int32)
    # perf_counter is a monotonic clock meant for measuring intervals;
    # tokenization above is deliberately excluded from the timing.
    tick = time.perf_counter()

    # Define input config
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]

    # Attach input
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)

    # Define output config
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]

    # Hit triton server
    response = client.infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs)
    logits = response.as_numpy(output_name[0])
    tock = time.perf_counter()
    print(f'Time taken: {tock - tick}')

In [None]:
# Look up the label of the highest-scoring class in the first row of the response.
top_class = logits[0].argmax()
id2label[str(top_class)]

### TorchScript backend

In [None]:
# Restart the notebook kernel (True asks for a restart rather than a plain shutdown).
import IPython

kernel = IPython.Application.instance().kernel
kernel.do_shutdown(True)

In [None]:
!mkdir -p ../models/torchscript/1

## Now create the model files (`config.pbtxt` and `model.pt`)

In [None]:
# Triton model configuration for the "torchscript" model (pytorch_libtorch platform).
# Declares two INT32 inputs of 256 values each, one FP32 output of 28 values
# and a maximum batch size of 32.
configuration = """
name: "torchscript"
platform: "pytorch_libtorch"
max_batch_size: 32

input [
 {
    name: "INPUT0"
    data_type: TYPE_INT32
    dims: [ 256 ]
  } ,
{
    name: "INPUT1"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
output {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 28 ]
  }

"""


# Write the config into the model directory, next to the "1" version folder.
with open('../models/torchscript/config.pbtxt', 'w') as f:
    f.write(configuration)

In [None]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time
import json
import os

os.environ['TOKENIZERS_PARALLELISM'] = 'False'


class TorchScriptModel(nn.Module):
    """Wrapper that exposes only the logits of a sequence-classification model.

    The forward pass takes two positional tensors and returns a single
    tensor, which is the interface handed to the tracer below.
    """

    def __init__(self, model_name):
        """Load the classifier identified by ``model_name`` with tuple outputs."""
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            return_dict=False,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the first element of its output tuple."""
        outputs = self.model(input_ids, attention_mask)
        logits = outputs[0]
        return logits


# Reload the locally saved tokenizer and weights.
model_name = '../weights'
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer("I feel lucky to be here.", return_tensors="pt", max_length=256, padding='max_length')

# Trace with INT32 tensors: config.pbtxt declares TYPE_INT32 inputs and the
# Triton client sends int32 arrays, so the example inputs should use the same
# dtype the served model will receive (the ONNX export does the same cast).
example_inputs = (
    inputs['input_ids'].type(torch.int32),
    inputs['attention_mask'].type(torch.int32),
)

model = TorchScriptModel(model_name)
model.eval()
traced_script_module = torch.jit.trace(model, example_inputs)
traced_script_module.save('../models/torchscript/1/model.pt')

# Time one forward pass of the traced module on the same inputs.
tick = time.perf_counter()
with torch.inference_mode():
    logits = traced_script_module(*example_inputs)
tock = time.perf_counter()
print(f'Time taken: {tock - tick}')

with open('../weights/config.json', 'r') as f:
    id2label = json.load(f)['id2label']

# Single-row batch, so the flat arg-max is the class index.
id2label[str(logits.argmax().item())]

#### Send request to server

In [None]:
# Restart the notebook kernel (True asks for a restart rather than a plain shutdown).
import IPython

kernel = IPython.Application.instance().kernel
kernel.do_shutdown(True)

In [None]:
!curl -v 0.0.0.0:8000/v2/health/ready

In [None]:
!curl -v 0.0.0.0:8000/v2/models/torchscript

In [None]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

os.environ['TOKENIZERS_PARALLELISM'] = 'False'

In [None]:
# Read the label map (class index -> label name) from the saved model config.
with open('../weights/config.json', 'r') as config_file:
    saved_config = json.load(config_file)
id2label = saved_config['id2label']

In [None]:
# Request settings for the "torchscript" model: tensor names and dtypes mirror config.pbtxt.
VERBOSE = False
model_name, model_version = 'torchscript', '1'
url = '0.0.0.0:8000'
input_name = [f'INPUT{index}' for index in range(2)]
input_dtype = ['INT32'] * len(input_name)
output_name = ['OUTPUT0']

In [None]:
# Sample sentence to classify, and the tokenizer saved alongside the weights.
text = 'I feel lucky to be here.'
tokenizer = AutoTokenizer.from_pretrained('../weights/')

In [None]:
# Send one inference request to the Triton HTTP endpoint and time the round trip.
with tritonhttpclient.InferenceServerClient(url=url, verbose=VERBOSE) as client:
    # Tokenize straight to NumPy so this client does not depend on torch.
    # truncation=True keeps longer texts at the 256 tokens the model config expects.
    encoded = tokenizer(text, return_tensors="np", max_length=256,
                        padding='max_length', truncation=True)
    # The request declares INT32 tensors (see input_dtype), so cast explicitly.
    input_ids = encoded['input_ids'].astype(np.int32)
    attention_mask = encoded['attention_mask'].astype(np.int32)
    # perf_counter is a monotonic clock meant for measuring intervals;
    # tokenization above is deliberately excluded from the timing.
    tick = time.perf_counter()

    # Define input config
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]

    # Attach input
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)

    # Define output config
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]

    # Hit triton server
    response = client.infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs)
    logits = response.as_numpy(output_name[0])
    tock = time.perf_counter()
    print(f'Time taken: {tock - tick}')

In [None]:
# Take the arg-max over the first (only) row of the batch: a flat argmax over
# the whole array would give an out-of-range key for any batch larger than one.
id2label[str(logits[0].argmax())]

### TensorRT backend (GPU only)

#### Installation Guide

Check the README.

#### Create models

Let's create two TensorRT plans: one in FP32 and one in FP16 (faster and uses less memory).

#### FP32

In [None]:
!mkdir -p ../models/tensorrt_fp32/1

In [None]:
!trtexec --onnx=../models/onnx/1/model.onnx --optShapes=INPUT0:16x256,INPUT1:16x256 --maxShapes=INPUT0:32x256,INPUT1:32x256 --minShapes=INPUT0:1x256,INPUT1:1x256 --shapes=INPUT0:1x256,INPUT1:1x256 --saveEngine=../models/tensorrt_fp32/1/model.plan

In [None]:
!curl -v 0.0.0.0:8000/v2/health/ready

In [None]:
!curl -v 0.0.0.0:8000/v2/models/tensorrt_fp32

In [None]:
# Triton model configuration for the "tensorrt_fp32" model (tensorrt_plan platform).
# Declares two INT32 inputs of 256 values each, one FP32 output of 28 values
# and a maximum batch size of 32.
# NOTE(review): the model-status check in the preceding cell runs before this
# config is written — confirm the intended cell order.
configuration = """
name: "tensorrt_fp32"
platform: "tensorrt_plan"
max_batch_size: 32

input [
 {
    name: "INPUT0"
    data_type: TYPE_INT32
    dims: [ 256 ]
  } ,
{
    name: "INPUT1"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
output {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 28 ]
}
"""

# Write the config into the model directory, next to the "1" version folder.
with open('../models/tensorrt_fp32/config.pbtxt', 'w') as f:
    f.write(configuration)

In [None]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

os.environ['TOKENIZERS_PARALLELISM'] = 'False'

In [None]:
# Request settings for the "tensorrt_fp32" model: tensor names and dtypes mirror config.pbtxt.
VERBOSE = False
model_name, model_version = 'tensorrt_fp32', '1'
url = '0.0.0.0:8000'
input_name = [f'INPUT{index}' for index in range(2)]
input_dtype = ['INT32'] * len(input_name)
output_name = ['OUTPUT0']

In [None]:
# Sample sentence to classify, and the tokenizer saved alongside the weights.
text = 'I feel lucky to be here.'
tokenizer = AutoTokenizer.from_pretrained('../weights/')

In [None]:
# Send one inference request to the Triton HTTP endpoint and time the round trip.
with tritonhttpclient.InferenceServerClient(url=url, verbose=VERBOSE) as client:
    # Tokenize straight to NumPy so this client does not depend on torch.
    # truncation=True keeps longer texts at the 256 tokens the model config expects.
    encoded = tokenizer(text, return_tensors="np", max_length=256,
                        padding='max_length', truncation=True)
    # The request declares INT32 tensors (see input_dtype), so cast explicitly.
    input_ids = encoded['input_ids'].astype(np.int32)
    attention_mask = encoded['attention_mask'].astype(np.int32)
    # perf_counter is a monotonic clock meant for measuring intervals;
    # tokenization above is deliberately excluded from the timing.
    tick = time.perf_counter()

    # Define input config
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]

    # Attach input
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)

    # Define output config
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]

    # Hit triton server
    response = client.infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs)
    logits = response.as_numpy(output_name[0])
    tock = time.perf_counter()
    print(f'Time taken: {tock - tick}')

In [None]:
# This section never loads id2label, so read it from the saved model config
# here instead of relying on state left over from earlier cells.
with open('../weights/config.json', 'r') as f:
    id2label = json.load(f)['id2label']

# Take the arg-max over the first (only) row of the batch: a flat argmax over
# the whole array would give an out-of-range key for any batch larger than one.
id2label[str(logits[0].argmax())]

#### FP16

In [None]:
!mkdir -p ../models/tensorrt_fp16/1


In [None]:
!trtexec --onnx=../models/onnx/1/model.onnx --optShapes=INPUT0:16x256,INPUT1:16x256 --maxShapes=INPUT0:32x256,INPUT1:32x256 --minShapes=INPUT0:1x256,INPUT1:1x256 --shapes=INPUT0:1x256,INPUT1:1x256 --saveEngine=../models/tensorrt_fp16/1/model.plan --fp16

In [None]:
!curl -v 0.0.0.0:8000/v2/models/tensorrt_fp16

In [None]:
# Triton model configuration for the "tensorrt_fp16" model (tensorrt_plan platform).
# Declares two INT32 inputs of 256 values each, one FP32 output of 28 values
# and a maximum batch size of 32.
# NOTE(review): the model-status check in the preceding cell runs before this
# config is written — confirm the intended cell order.
configuration = """
name: "tensorrt_fp16"
platform: "tensorrt_plan"
max_batch_size: 32

input [
 {
    name: "INPUT0"
    data_type: TYPE_INT32
    dims: [ 256 ]
  } ,
{
    name: "INPUT1"
    data_type: TYPE_INT32
    dims: [ 256 ]
  }
]
output {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 28 ]
}
"""


# Write the config into the model directory, next to the "1" version folder.
with open('../models/tensorrt_fp16/config.pbtxt', 'w') as f:
    f.write(configuration)

In [None]:
# Request settings for the "tensorrt_fp16" model: tensor names and dtypes mirror config.pbtxt.
VERBOSE = False
model_name, model_version = 'tensorrt_fp16', '1'
url = '0.0.0.0:8000'
input_name = [f'INPUT{index}' for index in range(2)]
input_dtype = ['INT32'] * len(input_name)
output_name = ['OUTPUT0']

In [None]:
# Sample sentence to classify, and the tokenizer saved alongside the weights.
text = 'I feel lucky to be here.'
tokenizer = AutoTokenizer.from_pretrained('../weights/')

In [None]:
# Send one inference request to the Triton HTTP endpoint and time the round trip.
with tritonhttpclient.InferenceServerClient(url=url, verbose=VERBOSE) as client:
    # Tokenize straight to NumPy so this client does not depend on torch.
    # truncation=True keeps longer texts at the 256 tokens the model config expects.
    encoded = tokenizer(text, return_tensors="np", max_length=256,
                        padding='max_length', truncation=True)
    # The request declares INT32 tensors (see input_dtype), so cast explicitly.
    input_ids = encoded['input_ids'].astype(np.int32)
    attention_mask = encoded['attention_mask'].astype(np.int32)
    # perf_counter is a monotonic clock meant for measuring intervals;
    # tokenization above is deliberately excluded from the timing.
    tick = time.perf_counter()

    # Define input config
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]

    # Attach input
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)

    # Define output config
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]

    # Hit triton server
    response = client.infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs)
    logits = response.as_numpy(output_name[0])
    tock = time.perf_counter()
    print(f'Time taken: {tock - tick}')

In [None]:
# This section has no import or id2label cell of its own, so bring in what the
# lookup needs here instead of relying on state left over from earlier cells.
import json

with open('../weights/config.json', 'r') as f:
    id2label = json.load(f)['id2label']

# Take the arg-max over the first (only) row of the batch: a flat argmax over
# the whole array would give an out-of-range key for any batch larger than one.
id2label[str(logits[0].argmax())]