# Inferencing with `triton` server with `FIL` backend running on EKS

Here we will use a `gRPC` client and an `http` client to perform inference against a Triton server running a custom FIL backend.

In [None]:
import os
import numpy
import subprocess
import sys
import time
import tritonclient.http as triton_http
import tritonclient.grpc as triton_grpc

### Step 1. Get the host ip and ports

In [None]:
def _ingress_gateway_field(jsonpath):
    """Return one field of the Istio ingress gateway service via ``kubectl``.

    Args:
        jsonpath: JSONPath template passed to ``kubectl ... -o jsonpath=...``.

    Returns:
        kubectl's standard output decoded as UTF-8, with surrounding
        whitespace removed.

    Raises:
        subprocess.CalledProcessError: if ``kubectl`` exits with a non-zero
            status.
    """
    # Build argv directly instead of whitespace-splitting a shell-style
    # command string: no shell is involved, so shell quotes around the
    # template would reach kubectl literally and be echoed back in its output.
    argv = [
        "kubectl", "-n", "istio-system",
        "get", "service", "istio-ingressgateway",
        "-o", f"jsonpath={jsonpath}",
    ]
    return subprocess.check_output(argv).decode("utf-8").strip()


# Ports are selected by the name of the service port entry.
http_port = _ingress_gateway_field("{.spec.ports[?(@.name=='http2')].port}")
grpc_port = _ingress_gateway_field("{.spec.ports[?(@.name=='tcp')].port}")
# Hostname of the first load-balancer ingress entry of the service.
host = _ingress_gateway_field("{.status.loadBalancer.ingress[0].hostname}")

print(host, http_port)
print(host, grpc_port)


### Step 2. Generate some dummy data to use with the `xgboost` model

In [None]:
# Build a random float32 feature matrix to stand in for real inputs; the same
# array is submitted through both the HTTP and gRPC clients further down.
samples = 8000  # number of rows in the request
features = 32   # number of feature columns per row
data = numpy.random.rand(samples, features).astype(numpy.float32)

### Perform inference

#### 2.a. HTTP Client Example

In [None]:
# HTTP client pointed at the ingress host and HTTP port looked up in Step 1.
http_client = triton_http.InferenceServerClient(
    url=f'{host}:{http_port}',
    verbose=False,
    concurrency=12
)

# Wait until BOTH the server and the model report ready. `and` is required:
# with `or` the loop would exit as soon as either one was ready. It also
# short-circuits, so the model is only queried once the server is ready.
while not (http_client.is_server_ready() and http_client.is_model_ready('xgb_model')):
    print("Waiting on server ready")
    time.sleep(5)
print(f"Is Server Ready: {http_client.is_server_ready()}")
print(f"Is FIL model ready: {http_client.is_model_ready('xgb_model')}")

In [None]:
%%time
# Describe the request tensor: an FP32 input named 'input__0' shaped
# (samples, features), filled from `data` and sent as binary data.
triton_input_http = triton_http.InferInput(
    name='input__0',
    shape=(samples, features),
    datatype='FP32',
)
triton_input_http.set_data_from_numpy(data, binary_data=True)

# Request 'output__0' back, also as binary data.
triton_output_http = triton_http.InferRequestedOutput(
    name='output__0',
    binary_data=True,
)

# Submit the inference request to version 1 of 'xgb_model' over HTTP.
request_http = http_client.infer(
    model_name='xgb_model',
    inputs=[triton_input_http],
    outputs=[triton_output_http],
    model_version='1',
)

In [None]:
# Extract the 'output__0' tensor from the HTTP response as a NumPy array.
result_http = request_http.as_numpy('output__0')
# Bare expression on the last line of the cell so the notebook displays it.
result_http

#### 2.b. gRPC Client Example

In [None]:
# gRPC client pointed at the ingress host and gRPC port looked up in Step 1.
grpc_client = triton_grpc.InferenceServerClient(
    url=f'{host}:{grpc_port}',
    verbose=False
)

# Wait until BOTH the server and the model report ready. `and` is required:
# with `or` the loop would exit as soon as either one was ready. It also
# short-circuits, so the model is only queried once the server is ready.
while not (grpc_client.is_server_ready() and grpc_client.is_model_ready('xgb_model')):
    print("Waiting on server ready")
    time.sleep(5)

print(f"Is Server Ready: {grpc_client.is_server_ready()}")
print(f"Is FIL model ready: {grpc_client.is_model_ready('xgb_model')}")

In [None]:
%%time 
# Describe the request tensor: an FP32 input named 'input__0' shaped
# [samples, features], filled from `data`.
triton_input_grpc = triton_grpc.InferInput(
    name='input__0',
    shape=[samples, features],
    datatype='FP32',
)
triton_input_grpc.set_data_from_numpy(data)

# Request the 'output__0' tensor in the response.
triton_output_grpc = triton_grpc.InferRequestedOutput(name='output__0')

# Submit the inference request to version 1 of 'xgb_model' over gRPC.
request_grpc = grpc_client.infer(
    model_name='xgb_model',
    inputs=[triton_input_grpc],
    outputs=[triton_output_grpc],
    model_version='1',
)

In [None]:
# Extract the 'output__0' tensor from the gRPC response as a NumPy array.
result_grpc = request_grpc.as_numpy('output__0')
# Bare expression on the last line of the cell so the notebook displays it.
result_grpc

### Compare results between HTTP and gRPC

In [None]:
# Check that we got the same result with both GRPC and HTTP; this raises
# AssertionError when the two arrays are not almost equal.
numpy.testing.assert_almost_equal(result_http, result_grpc)

In [None]:
# Query the server over HTTP for the inference statistics of 'xgb_model';
# as the last expression in the cell, the result is displayed by the notebook.
http_client.get_inference_statistics('xgb_model')

### Get information on the other models in the Triton Inference Server

In [None]:
# Query the server over HTTP for its model repository index; as the last
# expression in the cell, the result is displayed by the notebook.
http_client.get_model_repository_index()