# Overall

How the Triton Inference Server is set up in this notebook:

1. Choose the PyTorch model.
2. Choose the backend or platform you want to deploy your model to.
3. Write the config and place the model checkpoint for the compiled model. The config contains information about the backend/platform and the model's inputs and outputs.
4. Check whether Triton has loaded the model.
5. If it has, wrap the input in the `tritonclient` input objects and call the inference API.

### Python backend

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time

import os

# Turn off the tokenizers library's internal parallelism for this kernel.
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

model_name = 'joeddav/distilbert-base-uncased-go-emotions-student'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# return_dict=False makes the forward pass return a plain tuple; element 0 is the logits.
model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=False)
model.eval()

# Pad to exactly 256 tokens and truncate anything longer, so the sequence length
# is always the fixed 256 used throughout this notebook.
inputs = tokenizer("I feel lucky to be here.", return_tensors="pt", max_length=256,
                   padding='max_length', truncation=True)

# perf_counter() is the monotonic, high-resolution clock meant for timing intervals.
tick = time.perf_counter()
with torch.inference_mode():
    logits = model(**inputs)[0]

tock = time.perf_counter()
print(f'Time taken: {tock - tick}')

# Batch size is 1, so the flat argmax is the predicted class id.
predicted_label = model.config.id2label[logits.argmax().item()]
predicted_label

  from .autonotebook import tqdm as notebook_tqdm


Time taken: 0.12365293502807617


'relief'

In [2]:
# Persist the tokenizer first, then the model, into the shared weights folder
# that the later cells load from.
for artifact in (tokenizer, model):
    artifact.save_pretrained('../weights')

#### Setup configuration

1. https://github.com/triton-inference-server/python_backend/blob/main/examples/pytorch/model.py
2. https://github.com/triton-inference-server/python_backend/blob/main/examples/pytorch/config.pbtxt

In [3]:
# Create the repository entry for the `pytorch` model: a config.pbtxt next to
# a version directory ("1") that holds model.py.
!mkdir -p ../models/pytorch/1
!touch ../models/pytorch/config.pbtxt
!touch ../models/pytorch/1/model.py

## `touch` only creates empty placeholders; the contents of both files still
## have to be written by hand (the two example links above show the expected shape).

#### Send request to server

We will be using tritonclient to hit the API. Check [installation instructions](https://github.com/triton-inference-server/client#download-using-python-package-installer-pip).

In [4]:
### Let's restart the notebook
# do_shutdown(True) asks the running kernel to restart rather than just stop;
# the cells that follow re-import everything they need.

import IPython
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [None]:
# Readiness probe against the server's HTTP port; -v prints the request and response headers.
!curl -v 0.0.0.0:8000/v2/health/ready

In [None]:
# Ask the server about the model named `pytorch` (the name used as model_name in the client cells).
!curl -v 0.0.0.0:8000/v2/models/pytorch

In [3]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

# Turn off the tokenizers library's internal parallelism for this kernel.
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Read the saved model config and keep only its class-id -> label mapping.
with open('../weights/config.json', 'r') as config_file:
    saved_config = json.load(config_file)
id2label = saved_config['id2label']

In [5]:
# Settings for the request sent to the `pytorch` model.
VERBOSE = False  # presumably intended as the client's verbosity switch
# Tensor names, in order: INPUT0 carries input_ids, INPUT1 carries attention_mask.
input_name = ['INPUT0', 'INPUT1']
# Wire datatypes; they match the np.int32 arrays built in the request cell.
input_dtype = ['INT32', 'INT32']
output_name = ['OUTPUT0']
model_name = 'pytorch'
url = '0.0.0.0:8000'  # same host:port the curl checks above talk to
model_version = '1'

In [6]:
# Sample sentence to classify, and the tokenizer restored from the saved weights folder.
text = 'I feel lucky to be here.'
tokenizer = AutoTokenizer.from_pretrained('../weights/')

In [17]:
with tritonhttpclient.InferenceServerClient(url=url, verbose=VERBOSE) as client:
    # Tokenize straight to NumPy (this cell never needs torch). Pad to 256 and
    # truncate longer text so the tensors always have the fixed length of 256.
    encoded = tokenizer(text, return_tensors="np", max_length=256, padding='max_length', truncation=True)
    input_ids = encoded['input_ids'].astype(np.int32)
    attention_mask = encoded['attention_mask'].astype(np.int32)

    # Start timing after tokenization so only the request round-trip is measured.
    # perf_counter() is the monotonic clock meant for intervals.
    tick = time.perf_counter()

    # Describe each input tensor: name, shape and wire datatype.
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]

    # Attach the actual data to the descriptors.
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)

    # Name the output tensor we want back.
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]

    # Send the request and read the logits back as a NumPy array.
    response = client.infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs)
    logits = response.as_numpy(output_name[0])
    tock = time.perf_counter()
    print(f'Time taken: {tock - tick}')

Time taken: 0.18004775047302246


In [11]:
# Index of the largest logit over the whole array; id2label comes from JSON,
# so its keys are strings.
predicted_index = logits.argmax()
id2label[str(predicted_index)]

'relief'

### ONNX backend

In [None]:
### Let's restart the notebook
# do_shutdown(True) requests a kernel restart; the ONNX cells below re-import
# and redefine everything they use.

import IPython
IPython.Application.instance().kernel.do_shutdown(True)

In [16]:
# !mkdir -p ../models/onnx/1
# !touch ../models/onnx/config.pbtxt

# And create those files

In [17]:
# Tensor naming used for the ONNX export in the next cell.
VERBOSE = False
# Passed to torch.onnx.export as input_names / output_names, so the exported
# graph's tensors carry these names.
input_name = ['INPUT0', 'INPUT1']
input_dtype = ['INT32', 'INT32']
output_name = ['OUTPUT0']
# NOTE: model_name is reassigned to '../weights' by the export cell below.
model_name = 'onnx'
url = '0.0.0.0:8000'
model_version = '1'

In [18]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time
import json
from onnxruntime import InferenceSession
import os
import numpy as np

# Turn off the tokenizers library's internal parallelism for this kernel.
os.environ['TOKENIZERS_PARALLELISM'] = 'False'


class OnnxModel(nn.Module):
    """Thin wrapper around the sequence classifier that returns only the logits.

    The wrapped model is loaded with ``return_dict=False`` and therefore
    returns a tuple; ``forward`` takes two positional tensors and hands back
    the tuple's first element.
    """

    def __init__(self, model_name):
        """Load the classifier identified by ``model_name`` (hub id or local path)."""
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=False)

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return element 0 of its output tuple (the logits)."""
        model_outputs = self.model(input_ids, attention_mask)
        return model_outputs[0]


model_name = '../weights'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad to exactly 256 tokens and truncate anything longer so the example input
# has the fixed sequence length the exported graph is traced with.
inputs = tokenizer("I feel lucky to be here.", return_tensors="pt", max_length=256,
                   padding='max_length', truncation=True)

model = OnnxModel(model_name)
model.eval()


# Export with int32 ids/mask so the graph's input dtypes match the INT32 entries
# in input_dtype; only axis 0 (the batch) is marked dynamic.
with torch.inference_mode():
    torch.onnx.export(model, (inputs['input_ids'].type(torch.int32), inputs['attention_mask'].type(torch.int32)),
                      '../models/onnx/1/model.onnx', verbose=False,
                      input_names=input_name, output_names=output_name,
                      dynamic_axes={input_name[0]: {0: 'batch_size'}, input_name[1]: {0: 'batch_size'}, output_name[0]: {0: 'batch_size'}})


with open('../weights/config.json', 'r') as f:
    id2label = json.load(f)['id2label']

# Sanity-check the exported file locally with ONNX Runtime before serving it.
inputs = tokenizer("I feel lucky to be here.", return_tensors="np", max_length=256,
                   padding='max_length', truncation=True)
session = InferenceSession("../models/onnx/1/model.onnx")

# No torch context manager here: ONNX Runtime does not go through torch autograd.
# perf_counter() is the monotonic clock meant for timing intervals.
tick = time.perf_counter()
# session.run returns one array per requested output; keep the single logits array.
logits = session.run(output_names=output_name,
                     input_feed={input_name[0]: inputs['input_ids'].astype(np.int32),
                                 input_name[1]: inputs['attention_mask'].astype(np.int32)})[0]
tock = time.perf_counter()
print(f'Time taken: {tock - tick}')

# Row 0 is the only example in the batch.
id2label[str(logits[0].argmax().item())]

  mask, torch.tensor(torch.finfo(scores.dtype).min)


Time taken: 0.13427281379699707


'relief'

#### Send request to server

In [22]:
### Let's restart the notebook
# do_shutdown(True) requests a kernel restart; the client cells below re-import
# everything they need.

import IPython
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [None]:
# Readiness probe against the server's HTTP port; -v prints the request and response headers.
!curl -v 0.0.0.0:8000/v2/health/ready

In [None]:
# Ask the server about the model named `onnx` (the name used as model_name in the client cells).
!curl -v 0.0.0.0:8000/v2/models/onnx

In [4]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

# Turn off the tokenizers library's internal parallelism for this kernel.
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

In [5]:
# Settings for the request sent to the `onnx` model.
VERBOSE = False  # presumably intended as the client's verbosity switch
# Tensor names, in order: INPUT0 carries input_ids, INPUT1 carries attention_mask.
input_name = ['INPUT0', 'INPUT1']
# Wire datatypes; they match the np.int32 arrays built in the request cell.
input_dtype = ['INT32', 'INT32']
output_name = ['OUTPUT0']
model_name = 'onnx'
url = '0.0.0.0:8000'  # same host:port the curl checks above talk to
model_version = '1'

In [6]:
# Read the saved model config and keep only its class-id -> label mapping.
with open('../weights/config.json', 'r') as config_file:
    saved_config = json.load(config_file)
id2label = saved_config['id2label']

In [7]:
# Sample sentence to classify, and the tokenizer restored from the saved weights folder.
text = 'I feel lucky to be here.'
tokenizer = AutoTokenizer.from_pretrained('../weights/')

In [9]:
with tritonhttpclient.InferenceServerClient(url=url, verbose=VERBOSE) as client:
    # Tokenize straight to NumPy (this cell never needs torch). Pad to 256 and
    # truncate longer text so the tensors always have the fixed length of 256.
    encoded = tokenizer(text, return_tensors="np", max_length=256, padding='max_length', truncation=True)
    input_ids = encoded['input_ids'].astype(np.int32)
    attention_mask = encoded['attention_mask'].astype(np.int32)

    # Start timing after tokenization so only the request round-trip is measured.
    # perf_counter() is the monotonic clock meant for intervals.
    tick = time.perf_counter()

    # Describe each input tensor: name, shape and wire datatype.
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]

    # Attach the actual data to the descriptors.
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)

    # Name the output tensor we want back.
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]

    # Send the request and read the logits back as a NumPy array.
    response = client.infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs)
    logits = response.as_numpy(output_name[0])
    tock = time.perf_counter()
    print(f'Time taken: {tock - tick}')

Time taken: 0.12295889854431152


In [10]:
# Take the first (only) row of the batch, find its largest logit, and look the
# index up; id2label comes from JSON, so its keys are strings.
first_row = logits[0]
id2label[str(first_row.argmax())]

'relief'

### TorchScript backend

In [18]:
### Let's restart the notebook
# do_shutdown(True) requests a kernel restart; the TorchScript cells below
# re-import and redefine everything they use.

import IPython
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [14]:
# Create the repository entry for the `torchscript` model: a config.pbtxt next
# to a version directory ("1"), which the trace cell below writes model.pt into.
!mkdir -p ../models/torchscript/1
!touch ../models/torchscript/config.pbtxt

## `touch` only creates an empty placeholder; the contents of config.pbtxt
## still have to be written by hand.

In [15]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time
import json
import os

# Turn off the tokenizers library's internal parallelism for this kernel.
os.environ['TOKENIZERS_PARALLELISM'] = 'False'


class TorchScriptModel(nn.Module):
    """Thin wrapper around the sequence classifier that returns only the logits.

    The wrapped model is loaded with ``return_dict=False`` and therefore
    returns a tuple; ``forward`` takes two positional tensors and hands back
    the tuple's first element.
    """

    def __init__(self, model_name):
        """Load the classifier identified by ``model_name`` (hub id or local path)."""
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=False)

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return element 0 of its output tuple (the logits)."""
        model_outputs = self.model(input_ids, attention_mask)
        return model_outputs[0]


model_name = '../weights'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad to exactly 256 tokens and truncate anything longer so the example input
# has the fixed sequence length the model is traced with.
inputs = tokenizer("I feel lucky to be here.", return_tensors="pt", max_length=256,
                   padding='max_length', truncation=True)

model = TorchScriptModel(model_name)
model.eval()
# Trace the wrapper on the example inputs and write the result into the
# model's version directory.
traced_script_module = torch.jit.trace(model, (inputs['input_ids'], inputs['attention_mask']))
traced_script_module.save('../models/torchscript/1/model.pt')

# Time one forward pass of the traced module; perf_counter() is the monotonic
# clock meant for timing intervals.
tick = time.perf_counter()
with torch.inference_mode():
    logits = traced_script_module(inputs['input_ids'], inputs['attention_mask'])

tock = time.perf_counter()
print(f'Time taken: {tock - tick}')

with open('../weights/config.json', 'r') as f:
    id2label = json.load(f)['id2label']

# Row 0 is the only example in the batch; JSON keys are strings, hence str().
id2label[str(logits[0].argmax().item())]

Time taken: 0.16965603828430176


'relief'

#### Send request to server

In [16]:
### Let's restart the notebook
# do_shutdown(True) requests a kernel restart; the client cells below re-import
# everything they need.

import IPython
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [13]:
# Readiness probe against the server's HTTP port; -v prints the request and
# response headers (a 200 with an empty body in the captured output).
!curl -v 0.0.0.0:8000/v2/health/ready

*   Trying 0.0.0.0:8000...
* Connected to 0.0.0.0 (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1
> Host: 0.0.0.0:8000
> User-Agent: curl/7.84.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Length: 0
< Content-Type: text/plain
< 
* Connection #0 to host 0.0.0.0 left intact


In [14]:
# Fetch the metadata for the `torchscript` model; the JSON reply lists its
# versions, platform, and the name/datatype/shape of each input and output.
!curl -v 0.0.0.0:8000/v2/models/torchscript

*   Trying 0.0.0.0:8000...
* Connected to 0.0.0.0 (127.0.0.1) port 8000 (#0)
> GET /v2/models/torchscript HTTP/1.1
> Host: 0.0.0.0:8000
> User-Agent: curl/7.84.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Type: application/json
< Content-Length: 253
< 
* Connection #0 to host 0.0.0.0 left intact
{"name":"torchscript","versions":["1"],"platform":"pytorch_libtorch","inputs":[{"name":"INPUT0","datatype":"INT32","shape":[-1,256]},{"name":"INPUT1","datatype":"INT32","shape":[-1,256]}],"outputs":[{"name":"OUTPUT0","datatype":"FP32","shape":[-1,28]}]}

In [15]:
import tritonclient.http as tritonhttpclient
from transformers import AutoTokenizer
import time
import numpy as np

import os
import json

# Turn off the tokenizers library's internal parallelism for this kernel.
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

In [16]:
# Read the saved model config and keep only its class-id -> label mapping.
with open('../weights/config.json', 'r') as config_file:
    saved_config = json.load(config_file)
id2label = saved_config['id2label']

In [17]:
# Settings for the request sent to the `torchscript` model.
VERBOSE = False  # presumably intended as the client's verbosity switch
# Tensor names, in order: INPUT0 carries input_ids, INPUT1 carries attention_mask.
# They match the input names reported by the model-metadata curl call above.
input_name = ['INPUT0', 'INPUT1']
# Wire datatypes; they match the np.int32 arrays built in the request cell.
input_dtype = ['INT32', 'INT32']
output_name = ['OUTPUT0']
model_name = 'torchscript'
url = '0.0.0.0:8000'  # same host:port the curl checks above talk to
model_version = '1'

In [18]:
# Sample sentence to classify, and the tokenizer restored from the saved weights folder.
text = 'I feel lucky to be here.'
tokenizer = AutoTokenizer.from_pretrained('../weights/')

In [43]:
with tritonhttpclient.InferenceServerClient(url=url, verbose=VERBOSE) as client:
    # Tokenize straight to NumPy (this cell never needs torch). Pad to 256 and
    # truncate longer text so the tensors always match the [-1, 256] input
    # shape reported by the server's model metadata.
    encoded = tokenizer(text, return_tensors="np", max_length=256, padding='max_length', truncation=True)
    input_ids = encoded['input_ids'].astype(np.int32)
    attention_mask = encoded['attention_mask'].astype(np.int32)

    # Start timing after tokenization so only the request round-trip is measured.
    # perf_counter() is the monotonic clock meant for intervals.
    tick = time.perf_counter()

    # Describe each input tensor: name, shape and wire datatype.
    inputs = [
        tritonhttpclient.InferInput(input_name[0], input_ids.shape, input_dtype[0]),
        tritonhttpclient.InferInput(input_name[1], attention_mask.shape, input_dtype[1]),
    ]

    # Attach the actual data to the descriptors.
    inputs[0].set_data_from_numpy(input_ids)
    inputs[1].set_data_from_numpy(attention_mask)

    # Name the output tensor we want back.
    outputs = [
        tritonhttpclient.InferRequestedOutput(output_name[0]),
    ]

    # Send the request and read the logits back as a NumPy array.
    response = client.infer(model_name, model_version=model_version, inputs=inputs, outputs=outputs)
    logits = response.as_numpy(output_name[0])
    tock = time.perf_counter()
    print(f'Time taken: {tock - tick}')

Time taken: 0.12454485893249512


In [42]:
# The server reports OUTPUT0 as [-1, 28] (batch, classes). Take the argmax of
# row 0 rather than of the flattened array, so the index is always a valid
# class id. id2label comes from JSON, so its keys are strings.
id2label[str(logits[0].argmax())]

'relief'