In [2]:
!pip install -qU pip awscli boto3 sagemaker
!pip install nvidia-pyindex
!pip install tritonclient[http]

[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

### To create the JIT Trace model you need

Torch version 1.12.1 Torch Vision 0.13.1 and Cuda library 11.3

Secondly you need a GPU instance to run the notebook - this has been tested on a ml.g4dn.xlarge which comes with 1 gpu

In [3]:
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com, https://download.pytorch.org/whl/cu113
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import os 
import tarfile
import boto3, json, sagemaker, time
from sagemaker import get_execution_role
import concurrent.futures
import time

### OPTIONAL -- Create a JIT Traced model

#### Few points to note: The traced models is provided in the zip file which can be used as is
1. The Model after trace is now returing outputs like OUTPUT_0 , 1 etc
2. To change them to named outputs can be done and we can try post this issue gets resolved
3. To full create a jit traced model we will need to provide a sample inputs and hence for now we have created a scripted model
4. The TORCH and the TORCHSCRIPT libraries would need to match the container and hence we use the specific ones mentioned above

In [5]:
import torch
from pathlib import Path
print(torch.__version__)
import torchvision
print(torchvision.__version__)


1.12.1+cu113
0.13.1+cu113


In [6]:
torch.cuda.is_available()

True

**Convert the model into Serving mode**

In [7]:
original_model_path = Path("od-load-test-model/1/model.pth")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.load(original_model_path)
model = model.to(device)
model = model.eval()

In [66]:
class JitDummy(torch.nn.Module):
    # Modify original model to take int8 inputs and return Tuple[Tensor] results
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.model.eval()
        
    @torch.jit.ignore
    def debug_output(self, preds):
        test_o = [pred["boxes"] for pred in  preds]
        print(f"{type(test_o[0])}, {test_o[0]}, {type(test_o)},", flush=True)
        
    def forward(self, inp):
        orig_type = inp.dtype
         #return bboxes, labels, scores
        return (torch.ones((5, 3), dtype=orig_type), )  #torch.float16)


In [67]:
torch.float

torch.float32

In [68]:
class JitConcatWrapper(torch.nn.Module):
    # Modify original model to take int8 inputs and return Tuple[Tensor] results
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, inp):
        
        inp = inp.div(255)
        
        # this will make the jit model work with arbitrary batch sizes by splitting the input tensor along the batch axis
        _, preds = self.model([t.squeeze() for t in torch.split(inp,1)])
        
        bboxes = []
        labels = []
        scores = []
        img_idxs = []

        for idx, pred in enumerate(preds):
            num_detections = pred["labels"].shape[0]
            
            if num_detections == 0:
                continue
                
            else:
                bboxes.append(pred["boxes"])
                labels.append(pred["labels"])
                scores.append(pred["scores"])
                img_idxs.append(torch.full((num_detections,), idx, dtype=torch.int8))
        
        if len(labels) == 0: # return empty tensors if no detections in batch
            bboxes_out = torch.empty((0,4), dtype=torch.float32)
            labels_out = torch.empty((0,), dtype=torch.int64) 
            scores_out = torch.empty((0,), dtype=torch.float32)
            img_idxs_out = torch.empty((0,), dtype=torch.int8)
    
        else:
            bboxes_out = torch.cat(bboxes)
            labels_out = torch.cat(labels)
            scores_out = torch.cat(scores)
            img_idxs_out = torch.cat(img_idxs)
        
        return bboxes_out, labels_out, scores_out, img_idxs_out

In [69]:
class JitPadWrapper(torch.nn.Module):
    # Modify original model to take int8 inputs and return Tuple[Tensor] results
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.device = next(model.parameters()).device
        self.box_pad = torch.tensor([-1, -1, -1, -1], dtype=torch.float32).to(device)
        self.label_pad = torch.tensor([-1], dtype=torch.int64).to(device)
        self.scores_pad = torch.tensor([-1], dtype=torch.float32).to(device)
        

    def forward(self, inp):
        
        inp = inp.div(255)
        
        # this will make the jit model work with arbitrary batch sizes by splitting the input tensor along the batch axis
        _, preds = self.model([t.squeeze() for t in torch.split(inp,1)])
        
        
        max_detections = max([pred["labels"].shape[0] for pred in preds])
        
        for pred in preds:
            num_detections = pred["labels"].shape[0]
            padding_amount = max_detections - num_detections
            
            if padding_amount == 0:
                continue
            else:
                pred["boxes"] = torch.cat([pred["boxes"], self.box_pad.expand(padding_amount, 4)])
                pred["labels"] = torch.cat([pred["labels"], self.label_pad.expand(padding_amount, )])
                pred["scores"] = torch.cat([pred["scores"], self.scores_pad.expand(padding_amount, )])
        
        bboxes = torch.stack([pred["boxes"] for pred in  preds])
        labels = torch.stack([pred["labels"] for pred in  preds])
        scores = torch.stack([pred["scores"] for pred in  preds])
        
        return bboxes, labels, scores

In [70]:
class JitWrapper(torch.nn.Module):
    # Modify original model to take int8 inputs and return Tuple[Tensor] results
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.model.eval()
        
    @torch.jit.ignore
    def debug_output(self, preds):
        test_o = [pred["boxes"] for pred in  preds]
        print(f"{type(test_o[0])}, {test_o[0]}, {type(test_o)},", flush=True)
        
    def forward(self, inp):
        orig_type = inp.dtype
        inp = inp.div(255)
        local_device = torch.device('cuda') #if torch.cuda.is_available() else 'cpu')
        
        with torch.no_grad():
        #with torch.autocast(device_type='cuda'):
        
            # preds_old = [{
            #     'boxes':torch.ones((5,3), dtype=torch.float32),  # to match the config.pbtxt
            #     'labels':torch.ones((5,3), dtype=torch.int64), 
            #     'scores':torch.ones((5,3), dtype=torch.float32)
            # }] # - # -  type preds 'List[Dict[str, Tensor]]'.
            # this will make the jit model work with arbitrary batch sizes by splitting the input tensor along the batch axis
            _, preds = self.model([t.squeeze() for t in torch.split(inp,1)])

             # -  type preds 'List[Dict[str, Tensor]]'.
            #if not torch.jit.is_scripting(): 

            bboxes = torch.stack([pred["boxes"] for pred in  preds])
            labels = torch.stack([pred["labels"] for pred in  preds])
            scores = torch.stack([pred["scores"] for pred in  preds])
            
            bboxes = torch.stack([torch.nn.utils.rnn.pad_sequence([pred["boxes"] for pred in  preds])  ])
            labels = torch.stack([torch.nn.utils.rnn.pad_sequence([pred["labels"] for pred in  preds]) ])
            scores = torch.stack([torch.nn.utils.rnn.pad_sequence([pred["scores"] for pred in  preds]) ])
            print(f"bboxes.shape={bboxes.shape}:")
        
        return bboxes, labels, scores
        #return torch.ones((5, 3), dtype=torch.float) #torch.float16)


#### Tracing fails because the output of the model in _preds is nothing

In [71]:
#traced_wrapper = JitWrapper(model) 
#traced_wrapper = JitDummy(model)
traced_wrapper = JitConcatWrapper(model)
#traced_wrapper = JitPadWrapper(model)


local_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(local_device)
image_array_tensors = torch.tensor(image_batch_local,device=local_device,dtype=torch.float)#.cuda()
#image_array_tensors = torch.tensor(image_batch,device=local_device,dtype=torch.float)#.cuda()
print(image_array_tensors.shape, image_array_tensors[0].shape)

with torch.no_grad():
    traced_model = torch.jit.trace(traced_wrapper, (image_array_tensors))#image_array_tensors)
    
print(traced_model.code)

cuda
torch.Size([12, 3, 480, 856]) torch.Size([3, 480, 856])


ValueError: too many values to unpack (expected 2)

#### Script works since output is at run time

In [99]:
#traced_wrapper = JitWrapper(model) 
#traced_wrapper = JitDummy(model)
#traced_wrapper = JitConcatWrapper(model) # similiar to JitWrapper
traced_wrapper = JitPadWrapper(model)

jit_model = torch.jit.script(wrapped_model)
print(jit_model.code)

def forward(self,
    inp: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
  _0 = __torch__.torch.nn.utils.rnn.pad_sequence
  inp0 = torch.div(inp, 255)
  _1 = __torch__.torch.autograd.grad_mode.no_grad.__new__(__torch__.torch.autograd.grad_mode.no_grad)
  _2 = (_1).__init__()
  with _1:
    model = self.model
    _3 = annotate(List[Tensor], [])
    _4 = torch.split(inp0, 1)
    for _5 in range(torch.len(_4)):
      t = _4[_5]
      _6 = torch.append(_3, torch.squeeze(t))
    _7, preds, = (model).forward(_3, None, )
    _8 = annotate(List[Tensor], [])
    for _9 in range(torch.len(preds)):
      pred = preds[_9]
      _10 = torch.append(_8, pred["boxes"])
    bboxes = torch.stack([_0(_8, False, 0., )])
    _11 = annotate(List[Tensor], [])
    for _12 in range(torch.len(preds)):
      pred0 = preds[_12]
      _13 = torch.append(_11, pred0["labels"])
    labels = torch.stack([_0(_11, False, 0., )])
    _14 = annotate(List[Tensor], [])
    for _15 in range(torch.len(preds)):
      pred1 = preds

In [100]:
!mkdir -p jit-resnet-v3-model
!mkdir -p jit-resnet-v3-model/1
!rm jit-resnet-v3-model/1/model.pt

rm: cannot remove 'jit-resnet-v3-model/1/model.pt': No such file or directory


In [101]:
jit_model.save("./jit-resnet-v3-model/1/model.pt")
#torch.jit.save(jit_model, "./jit-resnet-v3-model/1/model.pt") 
#traced_model.save("jit-resnet-v3-model/1/model.pt")

In [103]:
!ls -alrt jit-resnet-v3-model/1

total 133560
drwxr-xr-x 3 root root      6144 Nov 25 05:16 ..
drwxr-xr-x 2 root root      6144 Dec  1 20:20 .ipynb_checkpoints
drwxr-xr-x 3 root root      6144 Dec  1 20:21 .
-rw-r--r-- 1 root root 136753044 Dec  1 20:21 model.pt


In [104]:
!rm -r jit-resnet-v3-model/1/.ip*
!ls -alrt jit-resnet-v3-model/1

total 133556
drwxr-xr-x 3 root root      6144 Nov 25 05:16 ..
-rw-r--r-- 1 root root 136753044 Dec  1 20:21 model.pt
drwxr-xr-x 2 root root      6144 Dec  1 20:21 .


In [105]:
%%writefile jit-resnet-v3-model/config.pbtxt
name: "jit-resnet-v3-model"
platform: "pytorch_libtorch"
max_batch_size: 12
input {
  name: "INPUT__0"
  data_type: TYPE_UINT8
  dims: [3,480,856]
}
output {
  name: "OUTPUT__0"
  data_type: TYPE_FP32
  dims: [-1]
}
output {
  name: "OUTPUT__1"
  data_type: TYPE_INT64
  dims: [-1]
}

output {
  name: "OUTPUT__2"
  data_type: TYPE_FP32
  dims: [-1]
}

instance_group {
  count: 3
  kind: KIND_GPU
}

Overwriting jit-resnet-v3-model/config.pbtxt


#### Now create the tar ball and upload 

In [106]:
model_path = "jit-resnet-v3-model"

output_filename = f"jit-resnet-v17-model.tar.gz"
with tarfile.open(output_filename, "w:gz") as tar:
    tar.add(model_path, arcname=model_path)
sm_client = boto3.client(service_name="sagemaker")
sagemaker_session = sagemaker.Session(boto_session=boto3.Session())
role = get_execution_role()

model_uri = sagemaker_session.upload_data(path=output_filename, key_prefix="temp_model")
print(model_uri)
os.remove(output_filename)

s3://sagemaker-eu-west-1-225730023796/temp_model/jit-resnet-v17-model.tar.gz


In [109]:
print(len(buffer[0]))
st_time = time.time()

result = invoke_endpoint(image_batch, m_name, 'jit-resnet-v17-model.tar.gz') #'model.tar.gz') # 'resnet_fpn_v2.tar.gz' # - 'jit-resnet-v5-model.tar.gz' ) # passing in a numpy aray already 
#result = invoke_endpoint(torch.tensor(image_batch_local[0]).unsqueeze(0).numpy(), m_name, 'jit-resnet-v11-model.tar.gz')

print(len(buffer),time.time() - st_time)
print(f"Test finished for 1 batch of {batch_size} images::result={result}::")
 

480
(12, 3, 480, 856)


ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "{"error":"[request id: <id_unknown>] failed to split the output tensor 'OUTPUT__0' in responses: expected batch size of atleast 12 in model output, got 1"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/resnet-fpn-v2-2022-11-25-19-22-56-057 in account 225730023796 for more information.

In [110]:
inputs = []
outputs = []
#input_data = torch.tensor(image_batch_local[0]).unsqueeze(0).numpy()
input_data = torch.tensor(image_batch[0]).unsqueeze(0).numpy()
print(input_data.shape)
    
runtime_sm_client = boto3.client('sagemaker-runtime')
#inputs.append(httpclient.InferInput("INPUT__0", [ len(input_data),h, w,3], "UINT8"))
    
inputs = [httpclient.InferInput("INPUT__0", input_data.shape, "UINT8")]
inputs[0].set_data_from_numpy(input_data, binary_data=True)
outputs_data = [httpclient.InferRequestedOutput(f"OUTPUT__{n}", binary_data=True) for n in range(3)]
    
request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs_data
)


response_invoke = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/vnd.sagemaker-triton.binary+json;json-header-size={}".format(
            header_length
        ),
        Body=request_body,
        TargetModel='jit-resnet-v17-model.tar.gz',
        
)
header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size="
header_length_str = response_invoke["ContentType"][len(header_length_prefix) :]
if not header_length_str:
        header_length_str='0'
#result = httpclient.InferenceServerClient.parse_response_body(response_invoke["Body"].read())
#result
binary_response = response_invoke["Body"].read()
binary_response

(1, 3, 480, 856)


b'{"model_name":"d85097bca17a6d3758be38f49b039fe3","model_version":"1","outputs":[{"name":"OUTPUT__0","datatype":"FP32","shape":[1,0,1,4],"parameters":{"binary_data_size":0}},{"name":"OUTPUT__1","datatype":"INT64","shape":[1,0,1],"parameters":{"binary_data_size":0}},{"name":"OUTPUT__2","datatype":"FP32","shape":[1,0,1],"parameters":{"binary_data_size":0}}]}'

In [111]:
httpclient.InferenceServerClient.parse_response_body(binary_response, verbose=True, content_encoding='utf8')

b'{"model_name":"d85097bca17a6d3758be38f49b039fe3","model_version":"1","outputs":[{"name":"OUTPUT__0","datatype":"FP32","shape":[1,0,1,4],"parameters":{"binary_data_size":0}},{"name":"OUTPUT__1","datatype":"INT64","shape":[1,0,1],"parameters":{"binary_data_size":0}},{"name":"OUTPUT__2","datatype":"FP32","shape":[1,0,1],"parameters":{"binary_data_size":0}}]}'


<tritonclient.http.InferResult at 0x7f6fce97f890>

In [20]:
#model_uri='s3://sagemaker-eu-west-1-225730023796/temp_model/model.tar.gz'
mme_path='s3://sagemaker-eu-west-1-225730023796/temp_model/'

In [17]:
from sagemaker.utils import name_from_base

#m_name =  name_from_base(f"{model_uri.rsplit('/')[-2]}")
m_name =  name_from_base(f"resnet-fpn-v2")

m_name

'resnet-fpn-v2-2022-11-25-19-22-56-057'

In [18]:
print(m_name)
model_uri.rsplit('/')[-2]

resnet-fpn-v2-2022-11-25-19-22-56-057


'temp_model'

In [19]:
#m_name = model_uri.rsplit("/")[-2] 

region = boto3.Session().region_name
image_account = '802834080501' # eu-west-1
base = "amazonaws.com"

#- {account_id}.dkr.ecr.{region}.{base}/sagemaker-tritonserver:22.07-py3
mme_triton_image_uri = (
    "{account_id}.dkr.ecr.{region}.{base}/sagemaker-tritonserver:22.08-py3".format(
        account_id=image_account, region=region, base=base
    )
)
triton_image_uri = mme_triton_image_uri

print(triton_image_uri)


container = {
    "Image": triton_image_uri, # "763104351884.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-tritonserver:22.07-py3", # "785573368785.dkr.ecr.us-east-1.amazonaws.com/sagemaker-tritonserver:22.09-py3",
    "ModelDataUrl": mme_path, # model_uri,
    "Mode": "MultiModel",
    "Environment": {
        #"SAGEMAKER_TRITON_DEFAULT_MODEL_NAME": "resnet_fpn_v2",
        "SAGEMAKER_TRITON_THREAD_COUNT": "10", #"200",
        "SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT": "5", #"10"
        #"SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE" : "20000000", #"1677721600", #"16777216000", "16777216"
        #"SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE": "1048576"

    },
}
sm_client = boto3.client(service_name="sagemaker")
try:
    create_model_response = sm_client.create_model(
        ModelName=m_name , 
        ExecutionRoleArn=get_execution_role(), 
        PrimaryContainer=container
    )
    print(create_model_response)
    
    create_endpoint_config_response = sm_client.create_endpoint_config(
        EndpointConfigName=m_name,
        ProductionVariants=[
            {
                "InstanceType": "ml.g4dn.2xlarge",
                "InitialVariantWeight": 1,
                "InitialInstanceCount": 1,
                "ModelName": m_name ,
                "VariantName": "AllTraffic",
            }
        ],
        )
    print(create_endpoint_config_response)
except Exception as e:
    print(f"Error: {e}\n Trying to create endpoint")



response = sm_client.create_endpoint(
    EndpointName=m_name ,
    EndpointConfigName=m_name)
print(response)

802834080501.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-tritonserver:22.08-py3
{'ModelArn': 'arn:aws:sagemaker:eu-west-1:225730023796:model/resnet-fpn-v2-2022-11-25-19-22-56-057', 'ResponseMetadata': {'RequestId': '9292f038-8541-4bd4-9a7c-5b737a0da87e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '9292f038-8541-4bd4-9a7c-5b737a0da87e', 'content-type': 'application/x-amz-json-1.1', 'content-length': '99', 'date': 'Fri, 25 Nov 2022 19:23:08 GMT'}, 'RetryAttempts': 0}}
{'EndpointConfigArn': 'arn:aws:sagemaker:eu-west-1:225730023796:endpoint-config/resnet-fpn-v2-2022-11-25-19-22-56-057', 'ResponseMetadata': {'RequestId': 'e7735e1d-c6c4-482d-a2dc-cbfc213bfc0d', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'e7735e1d-c6c4-482d-a2dc-cbfc213bfc0d', 'content-type': 'application/x-amz-json-1.1', 'content-length': '118', 'date': 'Fri, 25 Nov 2022 19:23:08 GMT'}, 'RetryAttempts': 0}}
{'EndpointArn': 'arn:aws:sagemaker:eu-west-1:225730023796:endpoint/resnet-fpn-v2-2022-11

In [20]:
response

{'EndpointArn': 'arn:aws:sagemaker:eu-west-1:225730023796:endpoint/resnet-fpn-v2-2022-11-25-19-22-56-057',
 'ResponseMetadata': {'RequestId': 'f3ca662a-a0b4-4dc6-804a-03673c324d0e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f3ca662a-a0b4-4dc6-804a-03673c324d0e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '105',
   'date': 'Fri, 25 Nov 2022 19:23:08 GMT'},
  'RetryAttempts': 0}}

In [22]:
old_m_name = m_name

In [22]:
import time
resp = sm_client.describe_endpoint(EndpointName=m_name)
status = resp["EndpointStatus"]
print("SINGLE:Model:endpoint:Triton:Status: " + status)

while status == "Creating":
    time.sleep(30)
    resp = sm_client.describe_endpoint(EndpointName=m_name)
    status = resp["EndpointStatus"]
    print("Single:model:triton:Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Single:model:triton:Status: " + status)

SINGLE:Model:endpoint:Triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: InService
Arn: arn:aws:sagemaker:eu-west-1:225730023796:endpoint/resnet-fpn-v2-2022-11-25-19-22-56-057
Single:model:triton:Status: InService


#### - Clean up 

In [82]:
if 1==2:
    try:
        sm_client.delete_endpoint(EndpointName=old_m_name)
    except:
        pass
    sm_client.delete_endpoint_config(EndpointConfigName=old_m_name)
    sm_client.delete_model(ModelName=old_m_name)

In [23]:
import requests
from io import BytesIO
from PIL import Image
import concurrent.futures
import tritonclient.http as httpclient
from botocore.config import Config
import numpy as np
import random
import boto3
import time



w,h = 856,480

urls = [
    "https://m.media-amazon.com/images/M/MV5BNDcwZDc2NTEtMzU0Ni00YTQyLWIyYTQtNTI3YjM0MzhmMmI4XkEyXkFqcGdeQXVyNTgyNTA4MjM@._V1_.jpg",
    "https://lh3.googleusercontent.com/05JfZ1ZdyzrRNvhJosUFdcjjJRFE7k2KhmeM2ujqeCbrcrCb1hkq7O_JdUBpQ3r9hi0YeSn4WgmKx3Ai8LHdM2SucxSzl9TRZ4fCAqETJ6WtHgE=s0",
    "https://assets.nintendo.com/image/upload/f_auto/q_auto/dpr_2.625/c_scale,w_400/ncom/en_US/games/switch/n/new-pokemon-snap-switch/hero",
    "https://images.nintendolife.com/d358c9f9118af/pokemon-go.900x.jpg",
    "https://cdn.vox-cdn.com/thumbor/IKt535q8LMnJDddmLL74TBtzv88=/0x266:1024x949/1280x854/cdn.vox-cdn.com/uploads/chorus_image/image/48942277/N3DS_PokemonSuperMysteryDungeon_MainIllustration_png_jpgcopy.0.0.jpg",
    "https://i.imgflip.com/3sn9mp.jpg",
    "https://techcrunch.com/wp-content/uploads/2017/08/cbsn.png"
]



endpoint_name = m_name
def read_image(i=0):
    url = random.choice(urls)
    
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))

    img = img.resize((w, h), Image.ANTIALIAS)
    return np.asarray(img, dtype='uint8')

def read_local_image(i=0,img_path='./shiba_inu_dog.jpg'):
    img=Image.open(img_path)

    img = img.resize((w, h), Image.ANTIALIAS)
    return np.asarray(img, dtype='uint8')


In [24]:
batch_size = 12
samples = 1
buffer = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    images_future = [executor.submit(read_image, i) for i in range(samples*batch_size)]

    for i, future in enumerate(concurrent.futures.as_completed(images_future)):
        buffer.append(future.result())

print(len(buffer))

batch_size = 12
samples = 1
buffer_local = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    images_future = [executor.submit(read_local_image, i, './shiba_inu_dog.jpg') for i in range(samples*batch_size)]

    for i, future in enumerate(concurrent.futures.as_completed(images_future)):
        buffer_local.append(future.result())

print(len(buffer_local))



12




12


In [25]:
image_batch = np.asarray(buffer, dtype='uint8').transpose(0,3,1,2)  # -- 12 x 3 x 480 x 856 
print(len(buffer[0]))
print(image_batch.shape)

image_batch_local = np.asarray(buffer_local, dtype='uint8').transpose(0,3,1,2)  # -- 12 x 3 x 480 x 856 
print(len(buffer_local[0]))
print(image_batch_local.shape)



480
(12, 3, 480, 856)
480
(12, 3, 480, 856)


## Torch Tensor manipulations

In [26]:
image_batch_local[0].shape # - (3, 480, 856)
torch.tensor(image_batch_local[0]).unsqueeze(0).shape # - (1, 3, 480, 856)

print(torch.tensor(image_batch_local[0]).unsqueeze(0).numpy().shape) # - (1, 3, 480, 856) -- but is of type numpy
print(type(torch.tensor(image_batch_local[0]).unsqueeze(0).numpy()) )

(1, 3, 480, 856)
<class 'numpy.ndarray'>


In [27]:
def invoke_endpoint(images,endpoint_name, target_model='model.tar.gz'): # - resnet_fpn_v3.tar.gz
    inputs = []
    outputs = []
    input_data = images # np.asarray(images, dtype='uint8')  # passing in a numpy aray already 
    print(input_data.shape)
    
    #inputs.append(httpclient.InferInput("INPUT__0", [ len(input_data),h, w,3], "UINT8"))
    
    inputs = [httpclient.InferInput("INPUT__0", images.shape, "UINT8")]
    inputs[0].set_data_from_numpy(images, binary_data=True)
    outputs = [httpclient.InferRequestedOutput(f"OUTPUT__{n}", binary_data=True) for n in range(3)]
    
    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs
    )


    runtime_sm_client = boto3.client(
        "sagemaker-runtime",
        region_name="eu-west-1", 
        config=Config(
            connect_timeout=5,
            read_timeout=60, #120,
            retries={'max_attempts': 2,'mode': 'standard'} #20
        )
    )
    
    response = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/vnd.sagemaker-triton.binary+json;json-header-size={}".format(
            header_length
        ),
        Body=request_body,
        TargetModel=target_model,
        
    )
    header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size="
    header_length_str = response["ContentType"][len(header_length_prefix) :]
    if not header_length_str:
        header_length_str='0'
    result = httpclient.InferenceServerClient.parse_response_body(response["Body"].read())
    return result


In [28]:
print(len(buffer[0]))
st_time = time.time()
#result = invoke_endpoint(buffer, m_name)
result = invoke_endpoint(image_batch_local, m_name, 'jit-resnet-v2-model.tar.gz') #'model.tar.gz') # 'resnet_fpn_v2.tar.gz' # - 'jit-resnet-v5-model.tar.gz' ) # passing in a numpy aray already 
print(len(buffer),time.time() - st_time)
print(f"Test finished for 1 batch of {batch_size} images::result={result}::")
            

    

480
(12, 3, 480, 856)
12 9.515902280807495
Test finished for 1 batch of 12 images::result=<tritonclient.http.InferResult object at 0x7f6fcdc1cd90>::


In [29]:
result_outputs = result.get_response()['outputs']
for single_output in result_outputs:
    print(f"Each:output:{single_output}")
    print(f"Values:output:{result.as_numpy(single_output['name'])} \n")
    
# - 'shape': [12, 0, 4], 'parameters': {'binary_data_size': 0}}

Each:output:{'name': 'OUTPUT__0', 'datatype': 'FP32', 'shape': [12, 0, 4], 'parameters': {'binary_data_size': 0}}
Values:output:[] 

Each:output:{'name': 'OUTPUT__1', 'datatype': 'INT64', 'shape': [12, 0], 'parameters': {'binary_data_size': 0}}
Values:output:[] 

Each:output:{'name': 'OUTPUT__2', 'datatype': 'FP32', 'shape': [12, 0], 'parameters': {'binary_data_size': 0}}
Values:output:[] 



In [119]:
import numpy as np

print("Starting invocation for model:: please wait ...")
results = []
for i in range(0, 50):
    start = time.time()
    invoke_endpoint(buffer, m_name)
    results.append((time.time() - start) * 1000)
print("\nPredictions for model latency: \n")
print("\nP95: " + str(np.percentile(results, 95)) + " ms\n")
print("P90: " + str(np.percentile(results, 90)) + " ms\n")
print("Average: " + str(np.average(results)) + " ms\n")

Starting invocation for model:: please wait ...

Predictions for model latency: 


P95: 1914.787781238556 ms

P90: 1897.0021724700928 ms

Average: 1874.2761421203613 ms



In [120]:
print("\nPredictions for model latency: \n")
print("\nP95: " + str(np.percentile(results, 95)) + " ms\n")
print("P90: " + str(np.percentile(results, 90)) + " ms\n")
print("Average: " + str(np.average(results)) + " ms\n")


Predictions for model latency: 


P95: 1914.787781238556 ms

P90: 1897.0021724700928 ms

Average: 1874.2761421203613 ms



In [3]:
#m_name='time-od-model-2022-11-22-22-56-24-328'

In [64]:
from multiprocessing import cpu_count
print(m_name)
cpu_count()

time-od-model-2022-11-23-06-58-07-449


96

In [89]:
import numpy as np
import time
import traceback
def run_worker(proc_id):
    #print("Starting invocation for model:: please wait ...")
    start_worker = time.time()
    results = [0]
    error_count = 0
    total_count = 0
    
    while ( (time.time() - start_worker) < 180 ) : # -- 300 sec  -- 1 hour 3600    2 hour 7200 is 4 is 14400 
        start = time.time()
        try:
            total_count = total_count + 1
            invoke_endpoint(buffer, m_name)
            results.append((time.time() - start) * 1000)
        except:
            #print(traceback.format_exc())
            error_count = error_count + 1
            time.sleep(0.05)
            
        
        
    print(f"{np.percentile(results, 95)} ms:total_success_count={total_count}::error_count={error_count}::")
    return f"{np.percentile(results, 95)} ms:total_success_count={total_count}::error_count={error_count}::"
  

In [90]:
# create a process pool
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from multiprocessing import Pool
from multiprocessing import cpu_count

max_workers_cpu = 45 #cpu_count() # -*2
print(f"Max_A-Sync:processes={max_workers_cpu}")


Max_A-Sync:processes=45


#### Use a Thread Pool 

In [87]:

result_pool_list = []
print(max_workers_cpu)
with ThreadPoolExecutor(max_workers=(max_workers_cpu+10)) as pool:
# call a function 
    for worker in range(max_workers_cpu) :
        result_p = pool.submit(run_worker, worker )
        result_pool_list.append(result_p)
    
    for result_p in result_pool_list:
        result_p.result() # blocks



45
37972.13671207428 ms:total_success_count=46::error_count=45::
42923.21339845657 ms:total_success_count=48::error_count=47::
0.0 ms:total_success_count=42::error_count=42::
0.0 ms:total_success_count=41::error_count=41::
47835.4568362236 ms:total_success_count=47::error_count=46::
0.0 ms:total_success_count=44::error_count=44::
57578.9089679718 ms:total_success_count=45::error_count=44::
18160.48011779785 ms:total_success_count=46::error_count=45::
13640.869450569153 ms:total_success_count=45::error_count=44::
28061.84641122818 ms:total_success_count=46::error_count=45::
27536.960530281067 ms:total_success_count=44::error_count=43::
23365.378677845 ms:total_success_count=42::error_count=41::
27590.657460689545 ms:total_success_count=45::error_count=44::
22840.94465970993 ms:total_success_count=48::error_count=47::
37384.10577774048 ms:total_success_count=41::error_count=40::
33125.83919763565 ms:total_success_count=42::error_count=41::
42899.2857336998 ms:total_success_count=46::erro

#### Use a Multi Process Pool 

In [91]:

#with ProcessPoolExecutor(max_workers=max_workers_cpu) as executor:
result_pool_list = []
print(max_workers_cpu)
result_list = []
def async_call_back(result):
    result_list.append(result)  
    
with Pool(processes=max_workers_cpu) as pool:
# call a function 
    for worker in range(max_workers_cpu) :
        result_p = pool.apply_async(func=run_worker, args=(worker,) , callback = async_call_back)
        result_pool_list.append(result_p)
    
    for result_p in result_pool_list:
        result_p.wait()



45
0.0 ms:total_success_count=1::error_count=1::
0.0 ms:total_success_count=1::error_count=1::
0.0 ms:total_success_count=1::error_count=1::
0.0 ms:total_success_count=1::error_count=1::
0.0 ms:total_success_count=1::error_count=1::
0.0 ms:total_success_count=1::error_count=1::
0.0 ms:total_success_count=1::error_count=1::
0.0 ms:total_success_count=1::error_count=1::
0.0 ms:total_success_count=1::error_count=1::
0.0 ms:total_success_count=1::error_count=1::
0.0 ms:total_success_count=1::error_count=1::
0.0 ms:total_success_count=1::error_count=1::
5175.184786319733 ms:total_success_count=2::error_count=1::
5132.3278069496155 ms:total_success_count=2::error_count=1::
5136.127984523773 ms:total_success_count=2::error_count=1::
10011.859548091888 ms:total_success_count=2::error_count=1::
9993.257522583008 ms:total_success_count=2::error_count=1::
9991.812241077423 ms:total_success_count=2::error_count=1::
14862.797772884369 ms:total_success_count=2::error_count=1::
14865.903055667877 ms:

In [92]:
result_list

['0.0 ms:total_success_count=1::error_count=1::',
 '0.0 ms:total_success_count=1::error_count=1::',
 '0.0 ms:total_success_count=1::error_count=1::',
 '0.0 ms:total_success_count=1::error_count=1::',
 '0.0 ms:total_success_count=1::error_count=1::',
 '0.0 ms:total_success_count=1::error_count=1::',
 '0.0 ms:total_success_count=1::error_count=1::',
 '0.0 ms:total_success_count=1::error_count=1::',
 '0.0 ms:total_success_count=1::error_count=1::',
 '0.0 ms:total_success_count=1::error_count=1::',
 '0.0 ms:total_success_count=1::error_count=1::',
 '0.0 ms:total_success_count=1::error_count=1::',
 '5175.184786319733 ms:total_success_count=2::error_count=1::',
 '5132.3278069496155 ms:total_success_count=2::error_count=1::',
 '5136.127984523773 ms:total_success_count=2::error_count=1::',
 '10011.859548091888 ms:total_success_count=2::error_count=1::',
 '9993.257522583008 ms:total_success_count=2::error_count=1::',
 '9991.812241077423 ms:total_success_count=2::error_count=1::',
 '14862.797772

### Clean up 

In [53]:
sm_client = boto3.client(service_name="sagemaker")
try:
    sm_client.delete_endpoint(EndpointName=old_m_name)
except:
    pass
sm_client.delete_endpoint_config(EndpointConfigName=old_m_name)
sm_client.delete_model(ModelName=old_m_name)

{'ResponseMetadata': {'RequestId': '6a237969-ddc1-4e5a-9639-05e765beb30e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6a237969-ddc1-4e5a-9639-05e765beb30e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Thu, 24 Nov 2022 01:10:11 GMT'},
  'RetryAttempts': 0}}

#### End timings test 

##### Image reading #####

In [150]:
import numpy as np
from PIL import Image

s3_client = boto3.client('s3')
s3_client.download_file(
    "sagemaker-sample-files",
    "datasets/image/pets/shiba_inu_dog.jpg",
    "shiba_inu_dog.jpg"
)

In [151]:
import tritonclient.http as httpclient

def get_sample_image():
    image_path = "./shiba_inu_dog.jpg"
    img = Image.open(image_path).convert("RGB")
    img = img.resize((224, 224))
    img = (np.array(img).astype(np.float32) / 255) - np.array(
        [0.485, 0.456, 0.406], dtype=np.float32
    ).reshape(1, 1, 3)
    img = img / np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(1, 1, 3)
    img = np.transpose(img, (2, 0, 1))
    return img.tolist()

def _get_sample_image_binary(input_name, output_name):
    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput(input_name, [1, 3, 224, 224], "FP32"))
    input_data = np.array(get_sample_image(), dtype=np.float32)
    input_data = np.expand_dims(input_data, axis=0)
    inputs[0].set_data_from_numpy(input_data, binary_data=True)
    outputs.append(httpclient.InferRequestedOutput(output_name, binary_data=True))
    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs
    )
    return request_body, header_length


def get_sample_image_binary_pt():
    return _get_sample_image_binary("INPUT__0", "OUTPUT__0")


def get_sample_image_binary_trt():
    return _get_sample_image_binary("input", "output")

In [174]:
input_data = np.array(get_sample_image(), dtype=np.float32)
print(len(input_data))
print(input_data.shape)
print(type(input_data))
print(input_data[0][0][0]) # 3d array

input_data = np.expand_dims(input_data, axis=0)
print(len(input_data))
print(input_data.shape)
print(type(input_data))
print(input_data[0][0][0][0]) # 4d array

print("Trying triton http now \n")

inputs = [httpclient.InferInput('INPUT_0', [1, 3, 224, 224], "FP32")]
inputs[0].set_data_from_numpy(input_data, binary_data=True)
print(len(inputs))
print(inputs[0].shape())
print(type(inputs[0]))

3
(3, 224, 224)
<class 'numpy.ndarray'>
0.2281874
1
(1, 3, 224, 224)
<class 'numpy.ndarray'>
0.2281874
Trying triton http now 

1
[1, 3, 224, 224]
<class 'tritonclient.http.InferInput'>


In [175]:
inputs[0].name()

'INPUT_0'

In [None]:
    images = read_image()
    inputs = []
    outputs = []
    input_data = np.asarray(images, dtype='uint8')
    inputs.append(httpclient.InferInput("INPUT__0", [ len(input_data),h, w,3], "UINT8"))
    inputs[0].set_data_from_numpy(input_data, binary_data=True)
    outputs.append(httpclient.InferRequestedOutput("BBOX", binary_data=True))
    outputs.append(httpclient.InferRequestedOutput("LABELS", binary_data=True))
    outputs.append(httpclient.InferRequestedOutput("SCORES", binary_data=True))
    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs
    )


    runtime_sm_client = boto3.client("sagemaker-runtime",region_name="eu-west-1", config=Config(connect_timeout=5,
                                                                                 read_timeout=120,
                                                                                 retries={
                                                                                     'max_attempts': 20,
                                                                                     'mode': 'standard'

                                                                                 }))
    response = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/vnd.sagemaker-triton.binary+json;json-header-size={}".format(
            header_length
        ),
        Body=request_body,
        
    )
    header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size="
    header_length_str = response["ContentType"][len(header_length_prefix) :]
    result = httpclient.InferenceServerClient.parse_response_body(
        response["Body"].read(), 
        header_length=int(header_length_str)
    )
    return result


In [19]:
%%writefile preprocess.py

import requests
from io import BytesIO
from PIL import Image
import concurrent.futures
import tritonclient.http as httpclient
from botocore.config import Config
import numpy as np
import random
import boto3
import time



w,h = 856,480

urls = [
    "https://m.media-amazon.com/images/M/MV5BNDcwZDc2NTEtMzU0Ni00YTQyLWIyYTQtNTI3YjM0MzhmMmI4XkEyXkFqcGdeQXVyNTgyNTA4MjM@._V1_.jpg",
    "https://lh3.googleusercontent.com/05JfZ1ZdyzrRNvhJosUFdcjjJRFE7k2KhmeM2ujqeCbrcrCb1hkq7O_JdUBpQ3r9hi0YeSn4WgmKx3Ai8LHdM2SucxSzl9TRZ4fCAqETJ6WtHgE=s0",
    "https://assets.nintendo.com/image/upload/f_auto/q_auto/dpr_2.625/c_scale,w_400/ncom/en_US/games/switch/n/new-pokemon-snap-switch/hero",
    "https://images.nintendolife.com/d358c9f9118af/pokemon-go.900x.jpg",
    "https://cdn.vox-cdn.com/thumbor/IKt535q8LMnJDddmLL74TBtzv88=/0x266:1024x949/1280x854/cdn.vox-cdn.com/uploads/chorus_image/image/48942277/N3DS_PokemonSuperMysteryDungeon_MainIllustration_png_jpgcopy.0.0.jpg",
    "https://i.imgflip.com/3sn9mp.jpg",
    "https://techcrunch.com/wp-content/uploads/2017/08/cbsn.png"
]



endpoint_name = "od-load-test-model"
def read_image(i):
    url = random.choice(urls)
    
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))

    img = img.resize((w, h), Image.ANTIALIAS)
    return np.asarray(img, dtype='uint8')




def invoke_endpoint(images):
    inputs = []
    outputs = []
    input_data = np.asarray(images, dtype='uint8')
    inputs.append(httpclient.InferInput("INPUT__0", [ len(input_data),h, w,3], "UINT8"))
    inputs[0].set_data_from_numpy(input_data, binary_data=True)
    outputs.append(httpclient.InferRequestedOutput("BBOX", binary_data=True))
    outputs.append(httpclient.InferRequestedOutput("LABELS", binary_data=True))
    outputs.append(httpclient.InferRequestedOutput("SCORES", binary_data=True))
    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs
    )


    runtime_sm_client = boto3.client("sagemaker-runtime",region_name="eu-west-1", config=Config(connect_timeout=5,
                                                                                 read_timeout=120,
                                                                                 retries={
                                                                                     'max_attempts': 20,
                                                                                     'mode': 'standard'

                                                                                 }))
    response = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/vnd.sagemaker-triton.binary+json;json-header-size={}".format(
            header_length
        ),
        Body=request_body,
        
    )
    header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size="
    header_length_str = response["ContentType"][len(header_length_prefix) :]
    result = httpclient.InferenceServerClient.parse_response_body(
        response["Body"].read(), 
        header_length=int(header_length_str)
    )
    return result



batch_size = 12
samples= 20

def run():
    j = 0 
    with concurrent.futures.ThreadPoolExecutor() as executor:
        while j < 1000:
            images_future = [executor.submit(read_image, i) for i in range(samples*batch_size)]
            buffer = []
            for i, future in enumerate(concurrent.futures.as_completed(images_future)):
                buffer.append(future.result())
                if len(buffer) >= batch_size:
                    st_time = time.time()
                    invoke_endpoint(buffer)
                    print(len(buffer),time.time() - st_time)
                    buffer.clear()
            j+=1
            

            
if __name__ == '__main__':
    print("start")
    run()
    print("end")

Writing preprocess.py


In [20]:
%%writefile Dockerfile
FROM python:3.10.8-buster

RUN pip install pillow tritonclient["http"] numpy requests futures tqdm boto3

Writing Dockerfile


In [22]:
!pip install sagemaker-studio-image-build

Collecting sagemaker-studio-image-build
  Downloading sagemaker_studio_image_build-0.6.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sagemaker-studio-image-build
  Building wheel for sagemaker-studio-image-build (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker-studio-image-build: filename=sagemaker_studio_image_build-0.6.0-py3-none-any.whl size=13469 sha256=7a6070e8d3c911d366460556304285e370236be25ba58439465ad9c81846b74e
  Stored in directory: /root/.cache/pip/wheels/c1/9c/e8/cbf0266d9d9b1b6161f7ba9ddf572d02aacd411e8a5b4d186b
Successfully built sagemaker-studio-image-build
Installing collected packages: sagemaker-studio-image-build
Successfully installed sagemaker-studio-image-build-0.6.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip ins

In [36]:
!sm-docker build . 

...[Container] 2022/11/20 15:22:03 Waiting for agent ping

[Container] 2022/11/20 15:22:04 Waiting for DOWNLOAD_SOURCE
[Container] 2022/11/20 15:22:08 Phase is DOWNLOAD_SOURCE
[Container] 2022/11/20 15:22:08 CODEBUILD_SRC_DIR=/codebuild/output/src144861214/src
[Container] 2022/11/20 15:22:08 YAML location is /codebuild/output/src144861214/src/buildspec.yml
[Container] 2022/11/20 15:22:08 Setting HTTP client timeout to higher timeout for S3 source
[Container] 2022/11/20 15:22:08 Processing environment variables
[Container] 2022/11/20 15:22:08 No runtime version selected in buildspec.
[Container] 2022/11/20 15:22:08 Moving to directory /codebuild/output/src144861214/src
[Container] 2022/11/20 15:22:08 Configuring ssm agent with target id: codebuild:58d545b4-55dc-467a-b0c4-930fd06071c5
[Container] 2022/11/20 15:22:08 Successfully updated ssm agent configuration
[Container] 2022/11/20 15:22:08 Registering with agent
[Container] 2022/11/20 15:22:08 Phases found in YAML: 3
[Container] 2022/1

In [37]:
from sagemaker.processing import ScriptProcessor, ProcessingInput
import sagemaker

script_processor = ScriptProcessor(
    base_job_name="loadt-test-endpoints",
    image_uri="225730023796.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-studio-d-gqidcsbwvhei:default-1663683956516", #"423151156806.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-studio-d-v8zfns8jon6p:aviad",
    command=["python3"],
    role=sagemaker.get_execution_role(),
    instance_count=10,
    instance_type="ml.t3.medium",
    env={"mode": "python"},
)


script_processor.run(
    code="preprocess.py",
    wait=True,
    logs=True)


Job Name:  loadt-test-endpoints-2022-11-20-15-25-09-036
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-225730023796/loadt-test-endpoints-2022-11-20-15-25-09-036/input/code/preprocess.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  []
  img = img.resize((w, h), Image.ANTIALIAS)[0m
  img = img.resize((w, h), Image.ANTIALIAS)[0m
  img = img.resize((w, h), Image.ANTIALIAS)[0m
  img = img.resize((w, h), Image.ANTIALIAS)[0m
  img = img.resize((w, h), Image.ANTIALIAS)[0m
  img = img.resize((w, h), Image.ANTIALIAS)[0m
  img = img.resize((w, h), Image.ANTIALIAS)[0m
  img = img.resize((w, h), Image.ANTIALIAS)[0m
  img = img.resize((w, h), Image.ANTIALIAS)[0m
  img = img.resize((w, h), Image.ANTIALIAS)[0m
[34mstart[0m
[34mTraceback (most recent call last):
  File "/opt/ml/processing/input/code/p

UnexpectedStatusException: Error for Processing job loadt-test-endpoints-2022-11-20-15-25-09-036: Failed. Reason: AlgorithmError: See job logs for more information

In [77]:
a=""

In [5]:
pip install tritonclient[http]

Keyring is skipped due to an exception: 'keyring.backends'
Collecting tritonclient[http]
  Using cached tritonclient-2.27.0-py3-none-manylinux1_x86_64.whl (11.7 MB)
Collecting python-rapidjson>=0.9.1
  Using cached python_rapidjson-1.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
Collecting geventhttpclient<=2.0.2,>=1.4.4
  Using cached geventhttpclient-2.0.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (100 kB)
Collecting brotli
  Downloading Brotli-1.0.9-cp37-cp37m-manylinux1_x86_64.whl (357 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.2/357.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: brotli, python-rapidjson, tritonclient, geventhttpclient
Successfully installed brotli-1.0.9 geventhttpclient-2.0.2 python-rapidjson-1.9 tritonclient-2.27.0
[0mNote: you may need to restart the kernel to use updated packages.


In [144]:
import requests
from io import BytesIO
from PIL import Image
import concurrent.futures
import tritonclient.http as httpclient
from botocore.config import Config
import numpy as np
import random
import boto3
import time



w,h = 856,480

urls = [
    "https://m.media-amazon.com/images/M/MV5BNDcwZDc2NTEtMzU0Ni00YTQyLWIyYTQtNTI3YjM0MzhmMmI4XkEyXkFqcGdeQXVyNTgyNTA4MjM@._V1_.jpg",
    "https://lh3.googleusercontent.com/05JfZ1ZdyzrRNvhJosUFdcjjJRFE7k2KhmeM2ujqeCbrcrCb1hkq7O_JdUBpQ3r9hi0YeSn4WgmKx3Ai8LHdM2SucxSzl9TRZ4fCAqETJ6WtHgE=s0",
    "https://assets.nintendo.com/image/upload/f_auto/q_auto/dpr_2.625/c_scale,w_400/ncom/en_US/games/switch/n/new-pokemon-snap-switch/hero",
    "https://images.nintendolife.com/d358c9f9118af/pokemon-go.900x.jpg",
    "https://cdn.vox-cdn.com/thumbor/IKt535q8LMnJDddmLL74TBtzv88=/0x266:1024x949/1280x854/cdn.vox-cdn.com/uploads/chorus_image/image/48942277/N3DS_PokemonSuperMysteryDungeon_MainIllustration_png_jpgcopy.0.0.jpg",
    "https://i.imgflip.com/3sn9mp.jpg",
    "https://techcrunch.com/wp-content/uploads/2017/08/cbsn.png"
]



endpoint_name = "od-load-test-model"
def read_image(i):
    url = random.choice(urls)
    
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))

    img = img.resize((w, h), Image.ANTIALIAS)
    return np.asarray(img, dtype='uint8')



In [145]:
def invoke_endpoint(images):
    inputs = []
    outputs = []
    input_data = np.asarray(images, dtype='uint8')
    inputs.append(httpclient.InferInput("INPUT__0", [ len(input_data),h, w,3], "UINT8"))
    inputs[0].set_data_from_numpy(input_data, binary_data=True)
    outputs.append(httpclient.InferRequestedOutput("BBOX", binary_data=True))
    outputs.append(httpclient.InferRequestedOutput("LABELS", binary_data=True))
    outputs.append(httpclient.InferRequestedOutput("SCORES", binary_data=True))
    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs
    )


    runtime_sm_client = boto3.client("sagemaker-runtime",region_name="eu-west-1", config=Config(connect_timeout=5,
                                                                                 read_timeout=120,
                                                                                 retries={
                                                                                     'max_attempts': 20,
                                                                                     'mode': 'standard'

                                                                                 }))
    response = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/vnd.sagemaker-triton.binary+json;json-header-size={}".format(
            header_length
        ),
        Body=request_body,
        
    )
    header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size="
    header_length_str = response["ContentType"][len(header_length_prefix) :]
    result = httpclient.InferenceServerClient.parse_response_body(
        response["Body"].read(), 
        header_length=int(header_length_str)
    )
    return result


In [146]:

batch_size = 12
samples = 20

def run():
    j = 0 
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        while j < 100000:
            images_future = [executor.submit(read_image, i) for i in range(samples*batch_size)]
            buffer = []
            for i, future in enumerate(concurrent.futures.as_completed(images_future)):
                buffer.append(future.result())
                if len(buffer) >= batch_size:
                    st_time = time.time()
                    invoke_endpoint(buffer)
                    print(i, len(buffer),time.time() - st_time)
                    buffer.clear()
            if j % 5 == 0:
                print("j=",j)
            j+=1
            

In [148]:
1+1

2

In [147]:
run()



11 12 26.824105262756348
23 12 5.339818716049194
35 12 51.8453414440155
47 12 51.857054233551025
59 12 106.01303791999817
71 12 53.19753122329712
83 12 52.97249102592468
95 12 5.2242491245269775
107 12 3.3581738471984863
119 12 55.068113565444946
131 12 49.649627447128296
143 12 10.65463137626648
155 12 53.033745765686035
167 12 47.64116454124451
179 12 3.825765609741211
191 12 54.106940507888794
203 12 50.772104024887085
215 12 4.235815048217773
227 12 54.194751024246216
239 12 50.604727029800415
j= 0
11 12 5.334282875061035
23 12 53.142837047576904
35 12 52.85909175872803
47 12 4.331985950469971
59 12 53.17338728904724
71 12 46.69490313529968
83 12 3.9397337436676025
95 12 48.00019955635071
107 12 46.3467698097229
119 12 4.1943089962005615
131 12 48.45679235458374
143 12 46.61094260215759
155 12 4.049340486526489
167 12 48.785634994506836
179 12 48.044116735458374
191 12 4.065878868103027
203 12 52.23169946670532
215 12 50.64727830886841
227 12 4.310130596160889
239 12 53.42579913139

KeyboardInterrupt: 

In [None]:
print('hello')