Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PyTorch serve with custom translators returning Unknown exception #3032

Closed
samruds opened this issue Mar 20, 2024 · 4 comments
Closed

PyTorch serve with custom translators returning Unknown exception #3032

samruds opened this issue Mar 20, 2024 · 4 comments
Assignees
Labels
debugging sagemaker triaged Issue has been reviewed and triaged
Milestone

Comments

@samruds
Copy link

samruds commented Mar 20, 2024

🐛 Describe the bug

I am deploying a model with a new pytorch image. The model artifacts build and server worker is assigned a PID. However, the worker crashes without a clear exception (I have fixed any import errors in the implementation interface)

@pytest.fixture
def test_image():
    """Sample image payload sent to the endpoint during prediction tests."""
    # os.path.join already returns a str, so no extra conversion is needed.
    image_path = os.path.join(PYTORCH_SQUEEZENET_RESOURCE_DIR, "zidane.jpeg")
    return Image.open(image_path)


@pytest.fixture
def squeezenet_inference_spec():
    """InferenceSpec that loads SqueezeNet 1.1 weights and runs no-grad inference."""

    class MySqueezeNetModel(InferenceSpec):
        """Custom inference spec wrapping a SqueezeNet 1.1 classifier."""

        def load(self, model_dir: str):
            """Load weights from ``<model_dir>/model.pth`` and switch to eval mode."""
            net = squeezenet1_1()
            net.load_state_dict(torch.load(model_dir + "/model.pth"))
            net.eval()
            return net

        def invoke(self, input_object: object, model: object):
            """Run a single forward pass without tracking gradients."""
            with torch.no_grad():
                return model(input_object)

    return MySqueezeNetModel()


@pytest.fixture
def custom_request_translator():
    """Request translator: PIL image -> npy bytes (client), bytes -> tensor (server)."""

    class MyRequestTranslator(CustomPayloadTranslator):
        """Serializes the request payload as a numpy ``.npy`` byte stream."""

        def __init__(self):
            super().__init__()
            # Standard ImageNet-style preprocessing pipeline.
            self.transform = transforms.Compose(
                [
                    transforms.Resize(256),
                    transforms.CenterCrop(224),
                    transforms.ToTensor(),
                    transforms.Normalize(
                        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                    ),
                ]
            )

        def serialize_payload_to_bytes(self, payload: object) -> bytes:
            """Client side: transform the image, add a batch dim, dump as npy bytes."""
            batched = self.transform(payload).unsqueeze(0)
            return self._convert_numpy_to_bytes(batched.numpy())

        def deserialize_payload_from_stream(self, stream) -> torch.Tensor:
            """Server side: read npy bytes from the stream and rebuild the tensor."""
            return torch.from_numpy(np.load(io.BytesIO(stream.read())))

        def _convert_numpy_to_bytes(self, np_array: np.ndarray) -> bytes:
            """Serialize an ndarray to npy-format bytes."""
            with io.BytesIO() as buffer:
                np.save(buffer, np_array)
                return buffer.getvalue()

    return MyRequestTranslator()


@pytest.fixture
def custom_response_translator():
    """Response translator: tensor -> npy bytes (server), bytes -> tensor (client)."""

    class MyResponseTranslator(CustomPayloadTranslator):
        """Serializes the response payload as a numpy ``.npy`` byte stream."""

        def serialize_payload_to_bytes(self, payload: torch.Tensor) -> bytes:
            """Server side: dump the tensor's ndarray as npy bytes."""
            return self._convert_numpy_to_bytes(payload.numpy())

        def deserialize_payload_from_stream(self, stream) -> object:
            """Client side: read npy bytes from the stream and rebuild the tensor."""
            np_array = np.load(io.BytesIO(stream.read()))
            return torch.from_numpy(np_array)

        def _convert_numpy_to_bytes(self, np_array: np.ndarray) -> bytes:
            """Serialize an ndarray to npy-format bytes."""
            with io.BytesIO() as buf:
                np.save(buf, np_array)
                return buf.getvalue()

    return MyResponseTranslator()


@pytest.fixture
def squeezenet_schema(custom_request_translator, custom_response_translator):
    """SchemaBuilder wired up with the custom request/response translators."""
    sample_in = Image.open(os.path.join(PYTORCH_SQUEEZENET_RESOURCE_DIR, "zidane.jpeg"))
    # NOTE(review): a random tensor is used as the sample output — presumably
    # only its shape/dtype matter to SchemaBuilder; confirm against the SDK docs.
    sample_out = torch.rand(3, 4)
    return SchemaBuilder(
        sample_input=sample_in,
        sample_output=sample_out,
        input_translator=custom_request_translator,
        output_translator=custom_response_translator,
    )


@pytest.fixture
def model_builder_inference_spec_schema_builder(squeezenet_inference_spec, squeezenet_schema):
    """ModelBuilder configured with the SqueezeNet inference spec and schema."""
    return ModelBuilder(
        model_path=PYTORCH_SQUEEZENET_RESOURCE_DIR,
        inference_spec=squeezenet_inference_spec,
        schema_builder=squeezenet_schema,
    )


@pytest.fixture
def model_builder(request):
    """Indirect fixture: resolve the fixture named by the parametrize value.

    Lets one test run against multiple ModelBuilder configurations by name.
    """
    return request.getfixturevalue(request.param)

@pytest.mark.skipif(
    PYTHON_VERSION_IS_NOT_310,  # or NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE,
    reason="The goal of these test are to test the serving components of our feature",
)
@pytest.mark.parametrize(
    "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True
)
@pytest.mark.slow_test
def test_happy_pytorch_sagemaker_endpoint(
    sagemaker_session,
    model_builder,
    cpu_instance_type,
    test_image,
):
    """Build, deploy, and invoke a SqueezeNet model on a SageMaker endpoint.

    Any exception raised during deploy/predict is captured instead of raised
    so that the endpoint and model are always cleaned up first; the failure
    is then re-surfaced via the assert in the ``finally`` block.
    """
    logger.info("Running in SAGEMAKER_ENDPOINT mode...")
    caught_ex = None  # deferred failure: set on error, asserted after cleanup

    iam_client = sagemaker_session.boto_session.client("iam")
    role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]

    model = model_builder.build(
        mode=Mode.SAGEMAKER_ENDPOINT, role_arn=role_arn, sagemaker_session=sagemaker_session
    )

    # Bound the whole deploy+predict cycle; the timeout context aborts a hung run.
    with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
        try:
            logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
            predictor = model.deploy(instance_type=cpu_instance_type, initial_instance_count=1)
            logger.info("Endpoint successfully deployed.")
            predictor.predict(test_image)
        except Exception as e:
            # Capture rather than raise so cleanup below always runs.
            caught_ex = e
        finally:
            cleanup_model_resources(
                sagemaker_session=model_builder.sagemaker_session,
                model_name=model.name,
                endpoint_name=model.endpoint_name,
            )
            if caught_ex:
                logger.exception(caught_ex)
                assert (
                    False
                ), f"{caught_ex} was thrown when running pytorch squeezenet sagemaker endpoint test"

Error logs

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,529 [INFO ] W-9003-model_1.0-stdout MODEL_LOG - service = model_loader.load( AllTraffic/i-0cb527de76ac4f29a
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9003-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9003-model_1.0-stdout
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - user_module = importlib.import_module(user_module_name)
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/importlib/init.py", line 126, in import_module
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - return _bootstrap._gcd_import(name[level:], package, level)
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1050, in _gcd_import
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1027, in _find_and_load
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1006, in _find_and_load_unlocked
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 688, in _load_unlocked
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 883, in exec_module
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 241, in _call_with_frames_removed
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/ml/model/code/inference.py", line 10, in
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.validations.check_integrity import perform_integrity_check
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/sagemaker/serve/init.py", line 5, in
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,533 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.builder.model_builder import ModelBuilder
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,545 [INFO ] epollEventLoopGroup-5-2 org.pytorch.serve.wlm.WorkerThread - 9002 Worker disconnected. WORKER_STARTED
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,546 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died.
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerThread - Auto recovery start timestamp: 1710913851547
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9002-model_1.0-stderr
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9002-model_1.0-stdout
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/sagemaker/serve/builder/model_builder.py", line 23, in
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,548 [INFO ] W-9002-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9002-model_1.0-stdout
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerThread - Retry worker: 9002 in 1 seconds.
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,885 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Backend worker process died.
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,885 [INFO ] epollEventLoopGroup-5-3 org.pytorch.serve.wlm.WorkerThread - 9000 Worker disconnected. WORKER_STARTED
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,889 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Traceback (most recent call last):
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 253, in
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died.
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - worker.run_server()
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,891 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Auto recovery start timestamp: 1710913851891
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9000-model_1.0-stderr
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9000-model_1.0-stdout
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Retry worker: 9000 in 1 seconds.
2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,891 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 221, in run_server
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,893 [INFO ] W-9000-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9000-model_1.0-stdout
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,914 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - Backend worker process died.
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - Traceback (most recent call last):
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 253, in
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,916 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - worker.run_server()
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,916 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 221, in run_server
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] epollEventLoopGroup-5-4 org.pytorch.serve.wlm.WorkerThread - 9001 Worker disconnected. WORKER_STARTED
2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,917 [WARN ] W-9001-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died.

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,529 [INFO ] W-9003-model_1.0-stdout MODEL_LOG - service = model_loader.load( AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9003-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9003-model_1.0-stdout AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - user_module = importlib.import_module(user_module_name) AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,530 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/importlib/init.py", line 126, in import_module AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - return _bootstrap._gcd_import(name[level:], package, level) AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1050, in _gcd_import AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1027, in _find_and_load AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 1006, in _find_and_load_unlocked AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,531 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 688, in _load_unlocked AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 883, in exec_module AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "", line 241, in _call_with_frames_removed AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/ml/model/code/inference.py", line 10, in AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.validations.check_integrity import perform_integrity_check AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/sagemaker/serve/init.py", line 5, in AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,533 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.builder.model_builder import ModelBuilder AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,545 [INFO ] epollEventLoopGroup-5-2 org.pytorch.serve.wlm.WorkerThread - 9002 Worker disconnected. WORKER_STARTED AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,546 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerThread - Auto recovery start timestamp: 1710913851547 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9002-model_1.0-stderr AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [WARN ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9002-model_1.0-stdout AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/sagemaker/serve/builder/model_builder.py", line 23, in AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,548 [INFO ] W-9002-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9002-model_1.0-stdout AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,547 [INFO ] W-9002-model_1.0 org.pytorch.serve.wlm.WorkerThread - Retry worker: 9002 in 1 seconds. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,885 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Backend worker process died. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,885 [INFO ] epollEventLoopGroup-5-3 org.pytorch.serve.wlm.WorkerThread - 9000 Worker disconnected. WORKER_STARTED AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,889 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Traceback (most recent call last): AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 253, in AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,890 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - worker.run_server() AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,891 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Auto recovery start timestamp: 1710913851891 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9000-model_1.0-stderr AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [WARN ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - terminateIOStreams() threadName=W-9000-model_1.0-stdout AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,892 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Retry worker: 9000 in 1 seconds. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:51.898Z 2024-03-20T05:50:51,891 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 221, in run_server AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,893 [INFO ] W-9000-model_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - Stopped Scanner - W-9000-model_1.0-stdout AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,914 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - Backend worker process died. AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - Traceback (most recent call last): AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 253, in AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,916 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - worker.run_server() AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,916 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/ts/model_service_worker.py", line 221, in run_server AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,915 [INFO ] epollEventLoopGroup-5-4 org.pytorch.serve.wlm.WorkerThread - 9001 Worker disconnected. WORKER_STARTED AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:52.149Z 2024-03-20T05:50:51,917 [WARN ] W-9001-model_1.0 org.pytorch.serve.wlm.BatchAggregator - Load model failed: model, error: Worker died.

Installation instructions

Install torchserve from source: No it is an image through sagemaker
Docker: no

Model Packaging

aws/sagemaker-python-sdk#4502

config.properties

2024-03-20T05:50:45.547Z Torchserve version: 0.8.2 AllTraffic/i-0cb527de76ac4f29a
2024-03-20T05:50:45.547Z TS Home: /opt/conda/lib/python3.10/site-packages
2024-03-20T05:50:45.547Z Current directory: /
2024-03-20T05:50:45.547Z Temp directory: /home/model-server/tmp
2024-03-20T05:50:45.547Z Metrics config path: /opt/conda/lib/python3.10/site-packages/ts/configs/metrics.yaml
2024-03-20T05:50:45.547Z Number of GPUs: 0
2024-03-20T05:50:45.547Z Number of CPUs: 4
2024-03-20T05:50:45.547Z Max heap size: 4008 M
2024-03-20T05:50:45.547Z Python executable: /opt/conda/bin/python3.10
2024-03-20T05:50:45.547Z Config file: /etc/sagemaker-ts.properties
2024-03-20T05:50:45.547Z Inference address: http://0.0.0.0:8080
2024-03-20T05:50:45.547Z Management address: http://0.0.0.0:8080
2024-03-20T05:50:45.547Z Metrics address: http://127.0.0.1:8082
2024-03-20T05:50:45.547Z Model Store: /.sagemaker/ts/models
2024-03-20T05:50:45.547Z Initial Models: model=/opt/ml/model
2024-03-20T05:50:45.547Z Log dir: /logs
2024-03-20T05:50:45.547Z Metrics dir: /logs
2024-03-20T05:50:45.547Z Netty threads: 0
2024-03-20T05:50:45.547Z Netty client threads: 0
2024-03-20T05:50:45.547Z Default workers per model: 4
2024-03-20T05:50:45.547Z Blacklist Regex: N/A
2024-03-20T05:50:45.547Z Maximum Response Size: 6553500
2024-03-20T05:50:45.547Z Maximum Request Size: 6553500
2024-03-20T05:50:45.547Z Limit Maximum Image Pixels: true
2024-03-20T05:50:45.547Z Prefer direct buffer: false
2024-03-20T05:50:45.547Z Allowed Urls: [file://.|http(s)?://.]
2024-03-20T05:50:45.547Z Custom python dependency for model allowed: false

2024-03-20T05:50:45.547Z Torchserve version: 0.8.2 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z TS Home: /opt/conda/lib/python3.10/site-packages AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Current directory: / AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Temp directory: /home/model-server/tmp AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Metrics config path: /opt/conda/lib/python3.10/site-packages/ts/configs/metrics.yaml AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Number of GPUs: 0 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Number of CPUs: 4 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Max heap size: 4008 M AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Python executable: /opt/conda/bin/python3.10 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Config file: /etc/sagemaker-ts.properties AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Inference address: http://0.0.0.0:8080 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Management address: http://0.0.0.0:8080 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Metrics address: http://127.0.0.1:8082 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Model Store: /.sagemaker/ts/models AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Initial Models: model=/opt/ml/model AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Log dir: /logs AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Metrics dir: /logs AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Netty threads: 0 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Netty client threads: 0 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Default workers per model: 4 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Blacklist Regex: N/A AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Maximum Response Size: 6553500 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Maximum Request Size: 6553500 AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Limit Maximum Image Pixels: true AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Prefer direct buffer: false AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Allowed Urls: [file://.|http(s)?://.] AllTraffic/i-0cb527de76ac4f29a

2024-03-20T05:50:45.547Z Custom python dependency for model allowed: false

Versions

Torchserve version: 0.8.2
Python runtime: 3.10.9

Repro instructions

N/A

Possible Solution

No response

@namannandan
Copy link
Collaborator

namannandan commented Mar 20, 2024

@samruds, looking at the logs:

.....
.....
2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/ml/model/code/inference.py", line 10, in [AllTraffic/i-0cb527de76ac4f29a](https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FEndpoints$252Fpytorch-inference-2024-03-20-05-48-16-301/log-events//AllTraffic$252Fi-0cb527de76ac4f29a$3Fstart$3D1710913851548$26refEventId$3D38154653862238670871230805536070088255722857549318979634)

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.validations.check_integrity import perform_integrity_check [AllTraffic/i-0cb527de76ac4f29a](https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FEndpoints$252Fpytorch-inference-2024-03-20-05-48-16-301/log-events//AllTraffic$252Fi-0cb527de76ac4f29a$3Fstart$3D1710913851548$26refEventId$3D38154653862238670871230805536070088255722857549318979635)

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,532 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - File "/opt/conda/lib/python3.10/site-packages/sagemaker/serve/init.py", line 5, in [AllTraffic/i-0cb527de76ac4f29a](https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FEndpoints$252Fpytorch-inference-2024-03-20-05-48-16-301/log-events//AllTraffic$252Fi-0cb527de76ac4f29a$3Fstart$3D1710913851548$26refEventId$3D38154653862238670871230805536070088255722857549318979636)

2024-03-20T05:50:51.648Z 2024-03-20T05:50:51,533 [INFO ] W-9002-model_1.0-stdout MODEL_LOG - from sagemaker.serve.builder.model_builder import ModelBuilder [AllTraffic/i-0cb527de76ac4f29a](https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FEndpoints$252Fpytorch-inference-2024-03-20-05-48-16-301/log-events//AllTraffic$252Fi-0cb527de76ac4f29a$3Fstart$3D1710913851548$26refEventId$3D38154653862238670871230805536070088255722857549318979637)
.....
.....

It seems like the model load failed because:

  1. Import failed because of absence of package or presence of incorrect version of the package
  2. All the artifacts required to load the model were not provided with the model tar archive

A few follow up questions to help debug further:

  1. Could you share your inference.py implementation?
  2. How is the model tar archive created and what are its contents?
  3. How are package dependencies of inference.py if any being installed?

@namannandan namannandan added triaged Issue has been reviewed and triaged debugging sagemaker and removed triaged Issue has been reviewed and triaged labels Mar 20, 2024
@samruds
Copy link
Author

samruds commented Mar 20, 2024

Could you share your inference.py implementation?

`"""This module is for SageMaker inference.py."""
from __future__ import absolute_import
import os
import io
import cloudpickle
import shutil
import platform
from pathlib import Path
from functools import partial
from sagemaker.serve.validations.check_integrity import perform_integrity_check
from sagemaker.serve.spec.inference_spec import InferenceSpec
from sagemaker.serve.detector.image_detector import _detect_framework_and_version, _get_model_base
from sagemaker.serve.detector.pickler import load_xgboost_from_json
import logging

logger = logging.getLogger(__name__)

inference_spec = None
native_model = None
schema_builder = None

def model_fn(model_dir):
"""Placeholder docstring"""
shared_libs_path = Path(model_dir + "/shared_libs")

if shared_libs_path.exists():
    # before importing, place dynamic linked libraries in shared lib path
    shutil.copytree(shared_libs_path, "/lib", dirs_exist_ok=True)

serve_path = Path(__file__).parent.joinpath("serve.pkl")
with open(str(serve_path), mode="rb") as file:
    global inference_spec, native_model, schema_builder
    obj = cloudpickle.load(file)
    if isinstance(obj[0], InferenceSpec):
        inference_spec, schema_builder = obj
    elif isinstance(obj[0], str) and obj[0] == "xgboost":
        model_class_name = os.getenv("MODEL_CLASS_NAME")
        model_save_path = Path(__file__).parent.joinpath("model.json")
        native_model = load_xgboost_from_json(
            model_save_path=str(model_save_path), class_name=model_class_name
        )
        schema_builder = obj[1]
    else:
        native_model, schema_builder = obj
if native_model:
    framework, _ = _detect_framework_and_version(
        model_base=str(_get_model_base(model=native_model))
    )
    if framework == "pytorch":
        native_model.eval()
    return native_model if callable(native_model) else native_model.predict
elif inference_spec:
    return partial(inference_spec.invoke, model=inference_spec.load(model_dir))

def input_fn(input_data, content_type):
"""Placeholder docstring"""
try:
if hasattr(schema_builder, "custom_input_translator"):
return schema_builder.custom_input_translator.deserialize(
io.BytesIO(input_data), content_type
)
else:
return schema_builder.input_deserializer.deserialize(
io.BytesIO(input_data), content_type[0]
)
except Exception as e:
raise Exception("Encountered error in deserialize_request.") from e

def predict_fn(input_data, predict_callable):
"""Placeholder docstring"""
return predict_callable(input_data)

def output_fn(predictions, accept_type):
"""Placeholder docstring"""
try:
if hasattr(schema_builder, "custom_output_translator"):
return schema_builder.custom_output_translator.serialize(predictions, accept_type)
else:
return schema_builder.output_serializer.serialize(predictions)
except Exception as e:
logger.error("Encountered error: %s in serialize_response." % e)
raise Exception("Encountered error in serialize_response.") from e

def _run_preflight_diagnostics():
_py_vs_parity_check()
_pickle_file_integrity_check()

def _py_vs_parity_check():
container_py_vs = platform.python_version()
local_py_vs = os.getenv("LOCAL_PYTHON")

if not local_py_vs or container_py_vs.split(".")[1] != local_py_vs.split(".")[1]:
    logger.warning(
        f"The local python version {local_py_vs} differs from the python version "
        f"{container_py_vs} on the container. Please align the two to avoid unexpected behavior"
    )

def _pickle_file_integrity_check():
with open("/opt/ml/model/code/serve.pkl", "rb") as f:
buffer = f.read()

metadeata_path = Path("/opt/ml/model/code/metadata.json")
perform_integrity_check(buffer=buffer, metadata_path=metadeata_path)

# on import, execute
_run_preflight_diagnostics()

@samruds
Copy link
Author

samruds commented Mar 20, 2024

  1. It is built and deployed using an interface in a SDK https://github.com/aws/sagemaker-python-sdk/blob/345381e1336c3188b26753fb21219484fc91cc8e/src/sagemaker/serve/builder/model_builder.py#L79
  2. We install the needed dependencies at the time of install. These are the approved images we use for containers https://github.com/aws/deep-learning-containers/blob/master/available_images.md

@lxning lxning added this to the v0.10.1 milestone Mar 26, 2024
@lxning lxning added this to Backlog in v0.11.0 lifecycle Mar 26, 2024
@samruds samruds closed this as completed Apr 1, 2024
@samruds
Copy link
Author

samruds commented Apr 1, 2024

Issue was resolved on our end by bringing extra dependencies for pytorch servers.

@namannandan namannandan moved this from Backlog to Done in v0.11.0 lifecycle Apr 26, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
debugging sagemaker triaged Issue has been reviewed and triaged
Projects
Development

No branches or pull requests

3 participants