Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PyTorch serve with custom translators returning Unknown exception #3032

Closed
samruds opened this issue Mar 20, 2024 · 4 comments
Closed

PyTorch serve with custom translators returning Unknown exception #3032

samruds opened this issue Mar 20, 2024 · 4 comments
Assignees
Labels
debugging sagemaker triaged Issue has been reviewed and triaged
Milestone

Comments

@samruds
Copy link

samruds commented Mar 20, 2024

🐛 Describe the bug

I am deploying a model with a new pytorch image. The model artifacts build and server worker is assigned a PID. However, the worker crashes without a clear exception (I have fixed any import errors in the implementation interface)

@pytest.fixture
def test_image():
    """Sample image payload sent to the endpoint during prediction tests."""
    # os.path.join already returns a str, so no extra conversion is needed.
    image_path = os.path.join(PYTORCH_SQUEEZENET_RESOURCE_DIR, "zidane.jpeg")
    return Image.open(image_path)


@pytest.fixture
def squeezenet_inference_spec():
    """InferenceSpec that loads SqueezeNet 1.1 weights and runs no-grad inference."""

    class MySqueezeNetModel(InferenceSpec):
        """Custom inference spec wrapping a SqueezeNet 1.1 classifier."""

        def load(self, model_dir: str):
            """Load weights from ``<model_dir>/model.pth`` and switch to eval mode."""
            net = squeezenet1_1()
            net.load_state_dict(torch.load(model_dir + "/model.pth"))
            net.eval()
            return net

        def invoke(self, input_object: object, model: object):
            """Run a single forward pass without tracking gradients."""
            with torch.no_grad():
                return model(input_object)

    return MySqueezeNetModel()


@pytest.fixture
def custom_request_translator():
    """Request translator: PIL image -> npy bytes (client), bytes -> tensor (server)."""

    class MyRequestTranslator(CustomPayloadTranslator):
        """Serializes the request payload as a numpy ``.npy`` byte stream."""

        def __init__(self):
            super().__init__()
            # Standard ImageNet-style preprocessing pipeline.
            self.transform = transforms.Compose(
                [
                    transforms.Resize(256),
                    transforms.CenterCrop(224),
                    transforms.ToTensor(),
                    transforms.Normalize(
                        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                    ),
                ]
            )

        def serialize_payload_to_bytes(self, payload: object) -> bytes:
            """Client side: transform the image, add a batch dim, dump as npy bytes."""
            batched = self.transform(payload).unsqueeze(0)
            return self._convert_numpy_to_bytes(batched.numpy())

        def deserialize_payload_from_stream(self, stream) -> torch.Tensor:
            """Server side: read npy bytes from the stream and rebuild the tensor."""
            return torch.from_numpy(np.load(io.BytesIO(stream.read())))

        def _convert_numpy_to_bytes(self, np_array: np.ndarray) -> bytes:
            """Serialize an ndarray to npy-format bytes."""
            with io.BytesIO() as buffer:
                np.save(buffer, np_array)
                return buffer.getvalue()

    return MyRequestTranslator()


@pytest.fixture
def custom_response_translator():
    """Response translator: tensor -> npy bytes (server), bytes -> tensor (client)."""

    class MyResponseTranslator(CustomPayloadTranslator):
        """Serializes the response payload as a numpy ``.npy`` byte stream."""

        def serialize_payload_to_bytes(self, payload: torch.Tensor) -> bytes:
            """Server side: dump the tensor's ndarray as npy bytes."""
            return self._convert_numpy_to_bytes(payload.numpy())

        def deserialize_payload_from_stream(self, stream) -> object:
            """Client side: read npy bytes from the stream and rebuild the tensor."""
            np_array = np.load(io.BytesIO(stream.read()))
            return torch.from_numpy(np_array)

        def _convert_numpy_to_bytes(self, np_array: np.ndarray) -> bytes:
            """Serialize an ndarray to npy-format bytes."""
            with io.BytesIO() as buf:
                np.save(buf, np_array)
                return buf.getvalue()

    return MyResponseTranslator()


@pytest.fixture
def squeezenet_schema(custom_request_translator, custom_response_translator):
    """SchemaBuilder wired up with the custom request/response translators."""
    sample_in = Image.open(os.path.join(PYTORCH_SQUEEZENET_RESOURCE_DIR, "zidane.jpeg"))
    # NOTE(review): a random tensor is used as the sample output — presumably
    # only its shape/dtype matter to SchemaBuilder; confirm against the SDK docs.
    sample_out = torch.rand(3, 4)
    return SchemaBuilder(
        sample_input=sample_in,
        sample_output=sample_out,
        input_translator=custom_request_translator,
        output_translator=custom_response_translator,
    )


@pytest.fixture
def model_builder_inference_spec_schema_builder(squeezenet_inference_spec, squeezenet_schema):
    """ModelBuilder configured with the SqueezeNet inference spec and schema."""
    return ModelBuilder(
        model_path=PYTORCH_SQUEEZENET_RESOURCE_DIR,
        inference_spec=squeezenet_inference_spec,
        schema_builder=squeezenet_schema,
    )


@pytest.fixture
def model_builder(request):
    """Indirect fixture: resolve the fixture named by the parametrize value.

    Lets one test run against multiple ModelBuilder configurations by name.
    """
    return request.getfixturevalue(request.param)

@pytest.mark.skipif(
    PYTHON_VERSION_IS_NOT_310,  # or NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE,
    reason="The goal of these test are to test the serving components of our feature",
)
@pytest.mark.parametrize(
    "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True
)
@pytest.mark.slow_test
def test_happy_pytorch_sagemaker_endpoint(
    sagemaker_session,
    model_builder,
    cpu_instance_type,
    test_image,
):
    """Build, deploy, and invoke a SqueezeNet model on a SageMaker endpoint.

    Any exception raised during deploy/predict is captured instead of raised
    so that the endpoint and model are always cleaned up first; the failure
    is then re-surfaced via the assert in the ``finally`` block.
    """
    logger.info("Running in SAGEMAKER_ENDPOINT mode...")
    caught_ex = None  # deferred failure: set on error, asserted after cleanup

    iam_client = sagemaker_session.boto_session.client("iam")
    role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]

    model = model_builder.build(
        mode=Mode.SAGEMAKER_ENDPOINT, role_arn=role_arn, sagemaker_session=sagemaker_session
    )

    # Bound the whole deploy+predict cycle; the timeout context aborts a hung run.
    with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
        try:
            logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
            predictor = model.deploy(instance_type=cpu_instance_type, initial_instance_count=1)
            logger.info("Endpoint successfully deployed.")
            predictor.predict(test_image)
        except Exception as e:
            # Capture rather than raise so cleanup below always runs.
            caught_ex = e
        finally:
            cleanup_model_resources(
                sagemaker_session=model_builder.sagemaker_session,
                model_name=model.name,
                endpoint_name=model.endpoint_name,
            )
            if caught_ex:
                logger.exception(caught_ex)
                assert (
                    False
                ), f"{caught_ex} was thrown when running pytorch squeezenet sagemaker endpoint test"

Error logs

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,529 [INFO ] W-9003-model_1.0-stdout MODEL_LOG - service = model_loader.load( AllTraffic/i-0cb527de76ac4f29a
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9003-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9003-model_1.0-stdout
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - user_module = importlib.import_module(user_module_name)
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/importlib/init.py", line 126, in import_module
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - return _bootstrap._gcd_import(name[level:], package, level)
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1050, in _gcd_import
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1027, in _find_and_load
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1006, in _find_and_load_unlocked
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 688, in _load_unlocked
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 883, in exec_module
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 241, in _call_with_frames_removed
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/ml/model/code/inference.py", line 10, in
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.validations.check_integrity import perform_integrity_check
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/sagemaker/serve/init.py", line 5, in
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,533 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.builder.model_builder import ModelBuilder
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,545 [INFO ] epollEventLoopGroup-5-2 org.pytorch.serve.wlm.WorkerThread - 9002 Worker disconnected. WORKER_STARTED
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,546 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died.
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerThread - Auto recovery start timestamp: 1710913851547
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9002-model_1.0-stderr
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9002-model_1.0-stdout
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/sagemaker/serve/builder/model_builder.py", line 23, in
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,548 [INFO ] W-9002-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9002-model_1.0-stdout
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerThread - Retry worker: 9002 in 1 seconds.
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,885 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Backend worker process died.
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,885 [INFO ] epollEventLoopGroup-5-3 org.pytorch.serve.wlm.WorkerThread - 9000 Worker disconnected. WORKER_STARTED
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,889 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Traceback (most recent call last):
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 253, in
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died.
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - worker.run_server()
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,891 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Auto recovery start timestamp: 1710913851891
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9000-model_1.0-stderr
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9000-model_1.0-stdout
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Retry worker: 9000 in 1 seconds.
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,891 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 221, in run_server
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,893 [INFO ] W-9000-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9000-model_1.0-stdout
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,914 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - Backend worker process died.
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - Traceback (most recent call last):
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 253, in
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,916 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - worker.run_server()
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,916 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 221, in run_server
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] epollEventLoopGroup-5-4 org.pytorch.serve.wlm.WorkerThread - 9001 Worker disconnected. WORKER_STARTED
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,917 [WARN ] W-9001-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died.

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,529 [INFO ] W-9003-model_1.0-stdout MODEL_LOG - service = model_loader.load( AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9003-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9003-model_1.0-stdout AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - user_module = importlib.import_module(user_module_name) AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/importlib/init.py", line 126, in import_module AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - return _bootstrap._gcd_import(name[level:], package, level) AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1050, in _gcd_import AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1027, in _find_and_load AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1006, in _find_and_load_unlocked AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 688, in _load_unlocked AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 883, in exec_module AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 241, in _call_with_frames_removed AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/ml/model/code/inference.py", line 10, in AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.validations.check_integrity import perform_integrity_check AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/sagemaker/serve/init.py", line 5, in AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,533 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.builder.model_builder import ModelBuilder AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,545 [INFO ] epollEventLoopGroup-5-2 org.pytorch.serve.wlm.WorkerThread - 9002 Worker disconnected. WORKER_STARTED AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,546 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerThread - Auto recovery start timestamp: 1710913851547 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9002-model_1.0-stderr AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9002-model_1.0-stdout AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/sagemaker/serve/builder/model_builder.py", line 23, in AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,548 [INFO ] W-9002-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9002-model_1.0-stdout AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerThread - Retry worker: 9002 in 1 seconds. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,885 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Backend worker process died. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,885 [INFO ] epollEventLoopGroup-5-3 org.pytorch.serve.wlm.WorkerThread - 9000 Worker disconnected. WORKER_STARTED AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,889 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Traceback (most recent call last): AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 253, in AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - worker.run_server() AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,891 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Auto recovery start timestamp: 1710913851891 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9000-model_1.0-stderr AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9000-model_1.0-stdout AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Retry worker: 9000 in 1 seconds. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,891 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 221, in run_server AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,893 [INFO ] W-9000-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9000-model_1.0-stdout AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,914 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - Backend worker process died. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - Traceback (most recent call last): AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 253, in AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,916 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - worker.run_server() AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,916 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 221, in run_server AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] epollEventLoopGroup-5-4 org.pytorch.serve.wlm.WorkerThread - 9001 Worker disconnected. WORKER_STARTED AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,917 [WARN ] W-9001-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died.

Installation instructions

Install torchserve from source: No it is an image through sagemaker
Docker: no

Model Packaging

aws/sagemaker-python-sdk#4502

config.properties

2024-03-20T05:50:45.547Z Torchserve version: 0.8.2 AllTraffic/i-0cb527de76ac4f29a
2024-03-20T05:50:45.547Z TS Home: /opt/conda/lib/python3.10/site-packages
2024-03-20T05:50:45.547Z Current directory: /
2024-03-20T05:50:45.547Z Temp directory: /home/model-server/tmp
2024-03-20T05:50:45.547Z Metrics config path: /opt/conda/lib/python3.10/site-packages/ts/configs/metrics.yaml
2024-03-20T05:50:45.547Z Number of GPUs: 0
2024-03-20T05:50:45.547Z Number of CPUs: 4
2024-03-20T05:50:45.547Z Max heap size: 4008 M
2024-03-20T05:50:45.547Z Python executable: /opt/conda/bin/python3.10
2024-03-20T05:50:45.547Z Config file: /etc/sagemaker-ts.properties
2024-03-20T05:50:45.547Z Inference address: http://0.0.0.0:8080
2024-03-20T05:50:45.547Z Management address: http://0.0.0.0:8080
2024-03-20T05:50:45.547Z Metrics address: http://127.0.0.1:8082
2024-03-20T05:50:45.547Z Model Store: /.sagemaker/ts/models
2024-03-20T05:50:45.547Z Initial Models: model=/opt/ml/model
2024-03-20T05:50:45.547Z Log dir: /logs
2024-03-20T05:50:45.547Z Metrics dir: /logs
2024-03-20T05:50:45.547Z Netty threads: 0
2024-03-20T05:50:45.547Z Netty client threads: 0
2024-03-20T05:50:45.547Z Default workers per model: 4
2024-03-20T05:50:45.547Z Blacklist Regex: N/A
2024-03-20T05:50:45.547Z Maximum Response Size: 6553500
2024-03-20T05:50:45.547Z Maximum Request Size: 6553500
2024-03-20T05:50:45.547Z Limit Maximum Image Pixels: true
2024-03-20T05:50:45.547Z Prefer direct buffer: false
2024-03-20T05:50:45.547Z Allowed Urls: [file://.|http(s)?://.]
2024-03-20T05:50:45.547Z Custom python dependency for model allowed: false

2024-03-20T05:50:45.547Z Torchserve version: 0.8.2 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z TS Home: /opt/conda/lib/python3.10/site-packages AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Current directory: / AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Temp directory: /home/model-server/tmp AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Metrics config path: /opt/conda/lib/python3.10/site-packages/ts/configs/metrics.yaml AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Number of GPUs: 0 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Number of CPUs: 4 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Max heap size: 4008 M AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Python executable: /opt/conda/bin/python3.10 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Config file: /etc/sagemaker-ts.properties AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Inference address: http://0.0.0.0:8080 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Management address: http://0.0.0.0:8080 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Metrics address: http://127.0.0.1:8082 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Model Store: /.sagemaker/ts/models AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Initial Models: model=/opt/ml/model AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Log dir: /logs AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Metrics dir: /logs AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Netty threads: 0 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Netty client threads: 0 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Default workers per model: 4 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Blacklist Regex: N/A AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Maximum Response Size: 6553500 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Maximum Request Size: 6553500 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Limit Maximum Image Pixels: true AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Prefer direct buffer: false AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Allowed Urls: [file://.|http(s)?://.] AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Custom python dependency for model allowed: false

Versions

Torchserve version: 0.8.2
Python runtime: 3.10.9

Repro instructions

N/A

Possible Solution

No response

@namannandan
Copy link
Collaborator

namannandan commented Mar 20, 2024

@samruds, looking at the logs:

.....
.....
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/ml/model/code/inference.py", line 10, in [AllTraffic/i-0cb527de76ac4f29a](https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FEndpoints$252Fpytorch-inference-2024-03-20-05-48-16-301/log-events//AllTraffic$252Fi-0cb527de76ac4f29a$3Fstart$3D1710913851548$26refEventId$3D38154653862238670871230805536070088255722857549318979634)

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.validations.check_integrity import perform_integrity_check [AllTraffic/i-0cb527de76ac4f29a](https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FEndpoints$252Fpytorch-inference-2024-03-20-05-48-16-301/log-events//AllTraffic$252Fi-0cb527de76ac4f29a$3Fstart$3D1710913851548$26refEventId$3D38154653862238670871230805536070088255722857549318979635)

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/sagemaker/serve/init.py", line 5, in [AllTraffic/i-0cb527de76ac4f29a](https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FEndpoints$252Fpytorch-inference-2024-03-20-05-48-16-301/log-events//AllTraffic$252Fi-0cb527de76ac4f29a$3Fstart$3D1710913851548$26refEventId$3D38154653862238670871230805536070088255722857549318979636)

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,533 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.builder.model_builder import ModelBuilder [AllTraffic/i-0cb527de76ac4f29a](https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FEndpoints$252Fpytorch-inference-2024-03-20-05-48-16-301/log-events//AllTraffic$252Fi-0cb527de76ac4f29a$3Fstart$3D1710913851548$26refEventId$3D38154653862238670871230805536070088255722857549318979637)
.....
.....

It seems like the model load failed because:

  1. Import failed because of absence of package or presence of incorrect version of the package
  2. All the artifacts required to load the model were not provided with the model tar archive

A few follow up questions to help debug further:

  1. Could you share your inference.py implementation?
  2. How is the model tar archive created and what are its contents?
  3. How are package dependencies of inference.py if any being installed?

@namannandan namannandan added triaged Issue has been reviewed and triaged debugging sagemaker and removed triaged Issue has been reviewed and triaged labels Mar 20, 2024
@samruds
Copy link
Author

samruds commented Mar 20, 2024

Could you share your inference.py implementation?

`"""This module is for SageMaker inference.py."""
from __future__ import absolute_import
import os
import io
import cloudpickle
import shutil
import platform
from pathlib import Path
from functools import partial
from sagemaker.serve.validations.check_integrity import perform_integrity_check
from sagemaker.serve.spec.inference_spec import InferenceSpec
from sagemaker.serve.detector.image_detector import _detect_framework_and_version, _get_model_base
from sagemaker.serve.detector.pickler import load_xgboost_from_json
import logging

logger = logging.getLogger(__name__)

inference_spec = None
native_model = None
schema_builder = None

def model_fn(model_dir):
"""Placeholder docstring"""
shared_libs_path = Path(model_dir + "/shared_libs")

if shared_libs_path.exists():
    # before importing, place dynamic linked libraries in shared lib path
    shutil.copytree(shared_libs_path, "/lib", dirs_exist_ok=True)

serve_path = Path(__file__).parent.joinpath("serve.pkl")
with open(str(serve_path), mode="rb") as file:
    global inference_spec, native_model, schema_builder
    obj = cloudpickle.load(file)
    if isinstance(obj[0], InferenceSpec):
        inference_spec, schema_builder = obj
    elif isinstance(obj[0], str) and obj[0] == "xgboost":
        model_class_name = os.getenv("MODEL_CLASS_NAME")
        model_save_path = Path(__file__).parent.joinpath("model.json")
        native_model = load_xgboost_from_json(
            model_save_path=str(model_save_path), class_name=model_class_name
        )
        schema_builder = obj[1]
    else:
        native_model, schema_builder = obj
if native_model:
    framework, _ = _detect_framework_and_version(
        model_base=str(_get_model_base(model=native_model))
    )
    if framework == "pytorch":
        native_model.eval()
    return native_model if callable(native_model) else native_model.predict
elif inference_spec:
    return partial(inference_spec.invoke, model=inference_spec.load(model_dir))

def input_fn(input_data, content_type):
"""Placeholder docstring"""
try:
if hasattr(schema_builder, "custom_input_translator"):
return schema_builder.custom_input_translator.deserialize(
io.BytesIO(input_data), content_type
)
else:
return schema_builder.input_deserializer.deserialize(
io.BytesIO(input_data), content_type[0]
)
except Exception as e:
raise Exception("Encountered error in deserialize_request.") from e

def predict_fn(input_data, predict_callable):
"""Placeholder docstring"""
return predict_callable(input_data)

def output_fn(predictions, accept_type):
"""Placeholder docstring"""
try:
if hasattr(schema_builder, "custom_output_translator"):
return schema_builder.custom_output_translator.serialize(predictions, accept_type)
else:
return schema_builder.output_serializer.serialize(predictions)
except Exception as e:
logger.error("Encountered error: %s in serialize_response." % e)
raise Exception("Encountered error in serialize_response.") from e

def _run_preflight_diagnostics():
_py_vs_parity_check()
_pickle_file_integrity_check()

def _py_vs_parity_check():
container_py_vs = platform.python_version()
local_py_vs = os.getenv("LOCAL_PYTHON")

if not local_py_vs or container_py_vs.split(".")[1] != local_py_vs.split(".")[1]:
    logger.warning(
        f"The local python version {local_py_vs} differs from the python version "
        f"{container_py_vs} on the container. Please align the two to avoid unexpected behavior"
    )

def _pickle_file_integrity_check():
with open("/opt/ml/model/code/serve.pkl", "rb") as f:
buffer = f.read()

metadeata_path = Path("/opt/ml/model/code/metadata.json")
perform_integrity_check(buffer=buffer, metadata_path=metadeata_path)

# on import, execute
_run_preflight_diagnostics()

@samruds
Copy link
Author

samruds commented Mar 20, 2024

  1. It is built and deployed using an interface in a SDK https://github.com/aws/sagemaker-python-sdk/blob/345381e1336c3188b26753fb21219484fc91cc8e/src/sagemaker/serve/builder/model_builder.py#L79
  2. We install the needed dependencies at the time of install. These are the approved images we use for containers https://github.com/aws/deep-learning-containers/blob/master/available_images.md

@lxning lxning added this to the v0.10.1 milestone Mar 26, 2024
@lxning lxning added this to Backlog in v0.11.0 lifecycle Mar 26, 2024
@samruds samruds closed this as completed Apr 1, 2024
@samruds
Copy link
Author

samruds commented Apr 1, 2024

Issue was resolved on our end by bringing extra dependencies for pytorch servers.

@namannandan namannandan moved this from Backlog to Done in v0.11.0 lifecycle Apr 26, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
debugging sagemaker triaged Issue has been reviewed and triaged
Projects
Development

No branches or pull requests

3 participants