# Example Notebook to show how to use RAPIDS with Triton

### Client Setup

In [1]:
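# Uncomment on first run to install the Triton client into this kernel's environment
# (%pip targets the running kernel; the quotes keep the shell from expanding the brackets).
# %pip install nvidia-pyindex
# %pip install "tritonclient[all]"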

### Import Libraries

In [2]:
import numpy as np
import json

import grpc
from tritonclient.grpc import service_pb2
from tritonclient.grpc import service_pb2_grpc
import tritonclient.grpc as grpcclient
from functools import partial

### Connect to the Triton (RAPIDS cuDF) Model

In [3]:
# Address of the Triton server's gRPC endpoint; send_preprocess_requet reads this global.
url='localhost:8001'

# High-level Triton gRPC client bound to that endpoint.
triton_client = grpcclient.InferenceServerClient(url=url,verbose=False)

# Raw gRPC channel and service stub on the same endpoint, used below to issue
# a ModelMetadata request directly through the protobuf API.
channel = grpc.insecure_channel(url)
grpc_stub = service_pb2_grpc.GRPCInferenceServiceStub(channel)

In [4]:
# Name of the Triton model to inspect.
preprocessing_model = 'rapids_tokenizer'
# Ask the server for the metadata of version 1 of that model through the raw gRPC stub.
request = service_pb2.ModelMetadataRequest(name=preprocessing_model,
                                           version='1')
response = grpc_stub.ModelMetadata(request)
# The printed response lists the model's input and output tensors (name, datatype, shape).
print("model metadata:\n{}".format(response))

model metadata:
name: "rapids_tokenizer"
versions: "1"
platform: "python"
inputs {
  name: "raw_logs"
  datatype: "BYTES"
  shape: -1
}
outputs {
  name: "input_ids"
  datatype: "INT32"
  shape: -1
  shape: 256
}
outputs {
  name: "attention_mask"
  datatype: "INT32"
  shape: -1
  shape: 256
}
outputs {
  name: "metadata"
  datatype: "INT32"
  shape: -1
  shape: 3
}



## Send Request to Model 

### Prepare Input 

### Request Sending Function

In [5]:
# Sample log lines to tokenize.
raw_sentences = ['Test sentence 1', 'Test sentence 2', 'Test sentence 3', 'Test sentence 4', 'Test sentence 5']
# The model's input is a BYTES tensor, so each sentence is encoded as UTF-8 bytes.
log_ls = [sentence.encode('utf-8') for sentence in raw_sentences]
# Build a 1-D object array. Without an explicit dtype NumPy would pick a
# fixed-width "S" dtype, which drops trailing NUL bytes from elements and
# degrades to float64 for an empty list.
log_ar = np.array(log_ls, dtype=object)

In [6]:
def callback(output, result, error):
    """Record the outcome of a request in ``output``.

    Appends ``error`` to ``output`` when ``error`` is truthy; otherwise
    appends ``result``. Exactly one item is appended per call.
    """
    outcome = error if error else result
    output.append(outcome)


    
def send_preprocess_requet(log_ar, model_name='rapids_tokenizer', client=None):
    """Send log lines to the tokenizer model on Triton and return the inference result.

    Parameters
    ----------
    log_ar : numpy.ndarray
        Array of encoded log lines. Its shape is used as the shape of the
        ``raw_logs`` BYTES input.
    model_name : str, optional
        Name of the Triton model to call.
    client : grpcclient.InferenceServerClient, optional
        Existing client to reuse, which avoids opening a new connection on
        every call. When omitted, a client is created from the module-level
        ``url`` for this call only and closed before returning.

    Returns
    -------
    The object returned by ``InferenceServerClient.infer``. The
    ``input_ids``, ``attention_mask`` and ``metadata`` outputs are requested
    and can be read from it with ``as_numpy``.
    """
    owns_client = client is None
    if owns_client:
        client = grpcclient.InferenceServerClient(url=url, verbose=False)

    try:
        input_grpc = grpcclient.InferInput("raw_logs", log_ar.shape, "BYTES")
        input_grpc.set_data_from_numpy(log_ar)

        outputs = [
            grpcclient.InferRequestedOutput(output_name)
            for output_name in ('input_ids', 'attention_mask', 'metadata')
        ]

        # Synchronous request. For an asynchronous one, call instead
        #     results = []  # list to hold the results of inference
        #     client.async_infer(model_name=model_name,
        #                        inputs=[input_grpc],
        #                        outputs=outputs,
        #                        callback=partial(callback, results))
        # and keep the client open until the callback has fired.
        return client.infer(model_name=model_name,
                            inputs=[input_grpc],
                            outputs=outputs)
    finally:
        # Only close a client this function created; a caller-supplied one stays open.
        if owns_client:
            client.close()

In [7]:
%%timeit
# Each timed loop is one full call to send_preprocess_requet, which also constructs
# a Triton client inside the function, so the figure includes that setup cost.
output = send_preprocess_requet(log_ar)

2.62 ms ± 474 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [8]:
# Run the request once in a regular cell so the result is bound to `output`
# for the cells below (names assigned inside a %%timeit cell are not kept).
output = send_preprocess_requet(log_ar)

##  Outputs

In [9]:
# Pull each requested output out of the inference result as a NumPy array.
input_ids = output.as_numpy('input_ids')
attention_mask = output.as_numpy('attention_mask')
metadata = output.as_numpy('metadata')
# NOTE(review): the recorded outputs show dtype=uint32 while the model metadata
# printed earlier lists these outputs as INT32 — confirm which is intended.
input_ids

array([[2774, 5650,  122, ...,    0,    0,    0],
       [2774, 5650,  123, ...,    0,    0,    0],
       [2774, 5650,  124, ...,    0,    0,    0],
       [2774, 5650,  125, ...,    0,    0,    0],
       [2774, 5650,  126, ...,    0,    0,    0]], dtype=uint32)

In [10]:
# presumably 1 marks a real token position and 0 marks padding — verify against the tokenizer model
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=uint32)

In [11]:
# The recorded output has one row of three integers per input sentence.
# NOTE(review): the meaning of the three columns is not shown in this notebook — confirm against the model.
metadata

array([[0, 0, 2],
       [1, 0, 2],
       [2, 0, 2],
       [3, 0, 2],
       [4, 0, 2]], dtype=uint32)